xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 61a6e5df8a56e9f4635d65196feaa1303d4e372f)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petscsystypes.h>
5 #include <petscerror.h>
6 
7 #include <Kokkos_Core.hpp>
8 #include <KokkosBlas.hpp>
9 #include <KokkosKernels_Sorting.hpp>
10 #include <KokkosSparse_CrsMatrix.hpp>
11 #include <KokkosSparse_spmv.hpp>
12 #include <KokkosSparse_spiluk.hpp>
13 #include <KokkosSparse_sptrsv.hpp>
14 #include <KokkosSparse_spgemm.hpp>
15 #include <KokkosSparse_spadd.hpp>
16 
17 #include <../src/mat/impls/aij/seq/aij.h>
18 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   PetscErrorCode    ierr;
29   Mat_SeqAIJ        *aijseq;
30   Mat_SeqAIJKokkos  *aijkok;
31 
32   PetscFunctionBegin;
33   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
34   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
35 
36   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38 
39   /* If aijkok does not exist, we just copy i, j to device.
40      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
41      In both cases, we build a new aijkok structure.
42   */
43   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
44     delete aijkok;
45     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
46     A->spptr = aijkok;
47   }
48 
49   if (aijkok && aijkok->device_mat_d.data()) {
50     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
51   }
52   PetscFunctionReturn(0);
53 }
54 
55 /* Sync CSR data to device if not yet */
56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
57 {
58   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
59 
60   PetscFunctionBegin;
61   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
62   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
63   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
64   if (aijkok->a_dual.need_sync_device()) {
65     aijkok->a_dual.sync_device();
66     aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */
67     aijkok->hermitian_updated = PETSC_FALSE;
68   }
69   PetscFunctionReturn(0);
70 }
71 
72 /* Mark the CSR data on device as modified */
73 static PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
74 {
75   PetscErrorCode   ierr;
76   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
77 
78   PetscFunctionBegin;
79   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
80   aijkok->a_dual.clear_sync_state();
81   aijkok->a_dual.modify_device();
82   aijkok->transpose_updated = PETSC_FALSE;
83   aijkok->hermitian_updated = PETSC_FALSE;
84   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
85   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
86   PetscFunctionReturn(0);
87 }
88 
89 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
90 {
91   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
92 
93   PetscFunctionBegin;
94   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
95    /* We do not expect one needs factors on host  */
96   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
97   if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
98   aijkok->a_dual.sync_host();
99   PetscFunctionReturn(0);
100 }
101 
102 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
103 {
104   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
105 
106   PetscFunctionBegin;
107   if (aijkok) {
108     aijkok->a_dual.sync_host();
109     *array = aijkok->a_dual.view_host().data();
110   } else { /* Happens when calling MatSetValues on a newly created matrix */
111     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
112   }
113   PetscFunctionReturn(0);
114 }
115 
116 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
117 {
118   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
119 
120   PetscFunctionBegin;
121   if (aijkok) aijkok->a_dual.modify_host();
122   PetscFunctionReturn(0);
123 }
124 
125 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
126 {
127   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
128 
129   PetscFunctionBegin;
130   if (aijkok) {
131     aijkok->a_dual.sync_host();
132     *array = aijkok->a_dual.view_host().data();
133   } else {
134     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
135   }
136   PetscFunctionReturn(0);
137 }
138 
139 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
140 {
141   PetscFunctionBegin;
142   *array = NULL;
143   PetscFunctionReturn(0);
144 }
145 
146 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
147 {
148   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
149 
150   PetscFunctionBegin;
151   if (aijkok) {
152     *array = aijkok->a_dual.view_host().data();
153   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
154     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
155   }
156   PetscFunctionReturn(0);
157 }
158 
159 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
160 {
161   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
162 
163   PetscFunctionBegin;
164   if (aijkok) {
165     aijkok->a_dual.clear_sync_state();
166     aijkok->a_dual.modify_host();
167   }
168   PetscFunctionReturn(0);
169 }
170 
171 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
172 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
173 {
174   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
175   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
176 
177   PetscFunctionBegin;
178   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
179   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
180   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
181   PetscFunctionReturn(0);
182 }
183 
184 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
185 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
186 {
187   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
188 
189   PetscFunctionBegin;
190   if (aijkok && aijkok->device_mat_d.data()) {
191     *d_mat = aijkok->device_mat_d.data();
192   } else {
193     PetscErrorCode   ierr;
194     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
195     *d_mat  = NULL;
196   }
197   PetscFunctionReturn(0);
198 }
199 
200 /* Generate the transpose on device and cache it internally */
201 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
202 {
203   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
204 
205   PetscFunctionBegin;
206   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
208     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
209     CHKERRCXX(aijkok->a_dual.sync_device());
210     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
211     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
212     aijkok->transpose_updated = PETSC_TRUE;
213   }
214   *csrmatT = &aijkok->csrmatT;
215   PetscFunctionReturn(0);
216 }
217 
218 /* Generate the Hermitian on device and cache it internally */
219 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
220 {
221   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
222 
223   PetscFunctionBegin;
224   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
225   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
226     CHKERRCXX(aijkok->a_dual.sync_device());
227     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
228     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
229    #if defined(PETSC_USE_COMPLEX)
230     const auto& a = aijkok->csrmatH.values;
231     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
232    #endif
233     aijkok->hermitian_updated = PETSC_TRUE;
234   }
235   *csrmatH = &aijkok->csrmatH;
236   PetscFunctionReturn(0);
237 }
238 
239 /* y = A x */
240 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
241 {
242   PetscErrorCode                   ierr;
243   Mat_SeqAIJKokkos                 *aijkok;
244   ConstPetscScalarKokkosView       xv;
245   PetscScalarKokkosView            yv;
246 
247   PetscFunctionBegin;
248   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
249   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
250   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
251   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
252   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
253   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
254   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
255   ierr   = WaitForKokkos();CHKERRQ(ierr);
256   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
257   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
258   PetscFunctionReturn(0);
259 }
260 
261 /* y = A^T x */
262 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
263 {
264   PetscErrorCode                   ierr;
265   Mat_SeqAIJKokkos                 *aijkok;
266   const char                       *mode;
267   ConstPetscScalarKokkosView       xv;
268   PetscScalarKokkosView            yv;
269   KokkosCsrMatrix                  *csrmat;
270 
271   PetscFunctionBegin;
272   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
273   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
274   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
275   if (A->form_explicit_transpose) {
276     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
277     mode = "N";
278   } else {
279     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
280     csrmat = &aijkok->csrmat;
281     mode = "T";
282   }
283   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
284   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
285   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
286   ierr = WaitForKokkos();CHKERRQ(ierr);
287   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
288   PetscFunctionReturn(0);
289 }
290 
291 /* y = A^H x */
292 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
293 {
294   PetscErrorCode                   ierr;
295   Mat_SeqAIJKokkos                 *aijkok;
296   const char                       *mode;
297   ConstPetscScalarKokkosView       xv;
298   PetscScalarKokkosView            yv;
299   KokkosCsrMatrix                  *csrmat;
300 
301   PetscFunctionBegin;
302   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
303   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
304   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
305   if (A->form_explicit_transpose) {
306     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
307     mode = "N";
308   } else {
309     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
310     csrmat = &aijkok->csrmat;
311     mode = "C";
312   }
313   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
314   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
315   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
316   ierr = WaitForKokkos();CHKERRQ(ierr);
317   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
318   PetscFunctionReturn(0);
319 }
320 
321 /* z = A x + y */
322 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
323 {
324   PetscErrorCode                   ierr;
325   Mat_SeqAIJKokkos                 *aijkok;
326   ConstPetscScalarKokkosView       xv,yv;
327   PetscScalarKokkosView            zv;
328 
329   PetscFunctionBegin;
330   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
331   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
332   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
333   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
334   if (zz != yy) Kokkos::deep_copy(zv,yv);
335   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
336   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
337   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
338   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
339   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
340   ierr = WaitForKokkos();CHKERRQ(ierr);
341   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
342   PetscFunctionReturn(0);
343 }
344 
345 /* z = A^T x + y */
346 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
347 {
348   PetscErrorCode                   ierr;
349   Mat_SeqAIJKokkos                 *aijkok;
350   const char                       *mode;
351   ConstPetscScalarKokkosView       xv,yv;
352   PetscScalarKokkosView            zv;
353   KokkosCsrMatrix                  *csrmat;
354 
355   PetscFunctionBegin;
356   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
357   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
358   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
359   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
360   if (zz != yy) Kokkos::deep_copy(zv,yv);
361   if (A->form_explicit_transpose) {
362     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
363     mode = "N";
364   } else {
365     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
366     csrmat = &aijkok->csrmat;
367     mode = "T";
368   }
369   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
370   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
371   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
372   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
373   ierr = WaitForKokkos();CHKERRQ(ierr);
374   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
375   PetscFunctionReturn(0);
376 }
377 
378 /* z = A^H x + y */
379 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
380 {
381   PetscErrorCode                   ierr;
382   Mat_SeqAIJKokkos                 *aijkok;
383   const char                       *mode;
384   ConstPetscScalarKokkosView       xv,yv;
385   PetscScalarKokkosView            zv;
386   KokkosCsrMatrix                  *csrmat;
387 
388   PetscFunctionBegin;
389   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
390   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
391   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
392   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
393   if (zz != yy) Kokkos::deep_copy(zv,yv);
394   if (A->form_explicit_transpose) {
395     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
396     mode = "N";
397   } else {
398     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
399     csrmat = &aijkok->csrmat;
400     mode = "C";
401   }
402   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
403   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
404   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
405   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
406   ierr = WaitForKokkos();CHKERRQ(ierr);
407   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
408   PetscFunctionReturn(0);
409 }
410 
411 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
412 {
413   PetscErrorCode            ierr;
414   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
415 
416   PetscFunctionBegin;
417   switch (op) {
418     case MAT_FORM_EXPLICIT_TRANSPOSE:
419       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
420       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
421       A->form_explicit_transpose = flg;
422       break;
423     default:
424       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
425       break;
426   }
427   PetscFunctionReturn(0);
428 }
429 
430 /* Depending on reuse, either build a new mat, or use the existing mat */
431 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
432 {
433   PetscErrorCode   ierr;
434   Mat_SeqAIJ       *aseq;
435 
436   PetscFunctionBegin;
437   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
438   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
439     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
440   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
441     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
442   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
443     if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
444     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
445     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
446     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
447     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
448     aseq = static_cast<Mat_SeqAIJ*>(A->data);
449     if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */
450       if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
451       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
452     }
453   }
454   PetscFunctionReturn(0);
455 }
456 
457 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
458    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
459  */
460 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
461 {
462   PetscErrorCode        ierr;
463   Mat_SeqAIJ            *bseq;
464   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
465   Mat                   mat;
466 
467   PetscFunctionBegin;
468   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
469   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
470   mat  = *B;
471   if (A->assembled) {
472     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
473     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
474     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
475     /* Now copy values to B if needed */
476     if (dupOption == MAT_COPY_VALUES) {
477       if (akok->a_dual.need_sync_device()) {
478         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
479         bkok->a_dual.modify_host();
480       } else { /* If device has the latest data, we only copy data on device */
481         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
482         bkok->a_dual.modify_device();
483       }
484     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
485       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
486       bkok->a_dual.modify_host();
487     }
488     mat->spptr = bkok;
489   }
490 
491   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
492   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
493   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
494   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
495   PetscFunctionReturn(0);
496 }
497 
498 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
499 {
500   PetscErrorCode    ierr;
501   Mat               At;
502   KokkosCsrMatrix   *internT,*csrmatT;
503   Mat_SeqAIJKokkos  *atkok,*bkok;
504 
505   PetscFunctionBegin;
506   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
507   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
508     CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */
509     CHKERRCXX(atkok   = new Mat_SeqAIJKokkos(*csrmatT));
510     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
511     if (reuse == MAT_INITIAL_MATRIX) *B = At;
512     else {ierr = MatHeaderMerge(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
513   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
514     if ((*B)->assembled) {
515       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
516       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
517       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
518     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
519       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
520       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
521       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
522       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
523       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
524     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
525   }
526   PetscFunctionReturn(0);
527 }
528 
529 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
530 {
531   PetscErrorCode             ierr;
532   Mat_SeqAIJKokkos           *aijkok;
533 
534   PetscFunctionBegin;
535   if (A->factortype == MAT_FACTOR_NONE) {
536     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
537     if (aijkok) {
538       if (aijkok->device_mat_d.data()) {
539         delete aijkok->colmap_d;
540         delete aijkok->i_uncompressed_d;
541       }
542       if (aijkok->diag_d) delete aijkok->diag_d;
543     }
544     delete aijkok;
545   } else {
546     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
547   }
548   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
549   A->spptr = NULL;
550   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
551   PetscFunctionReturn(0);
552 }
553 
554 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
555 {
556   PetscErrorCode ierr;
557 
558   PetscFunctionBegin;
559   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
560   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
561   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
562   PetscFunctionReturn(0);
563 }
564 
565 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
566 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
567 {
568   PetscErrorCode               ierr;
569   Mat_SeqAIJ                   *a,*b;
570   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
571   MatScalarKokkosView          aa,ba,ca;
572   MatRowMapKokkosView          ai,bi,ci;
573   MatColIdxKokkosView          aj,bj,cj;
574   PetscInt                     m,n,nnz,aN;
575 
576   PetscFunctionBegin;
577   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
578   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
579   PetscValidPointer(C,4);
580   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
581   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
582   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
583   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
584 
585   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
586   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
587   a    = static_cast<Mat_SeqAIJ*>(A->data);
588   b    = static_cast<Mat_SeqAIJ*>(B->data);
589   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
590   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
591   aa   = akok->a_dual.view_device();
592   ai   = akok->i_dual.view_device();
593   ba   = bkok->a_dual.view_device();
594   bi   = bkok->i_dual.view_device();
595   m    = A->rmap->n; /* M, N and nnz of C */
596   n    = A->cmap->n + B->cmap->n;
597   nnz  = a->nz + b->nz;
598   aN   = A->cmap->n; /* N of A */
599   if (reuse == MAT_INITIAL_MATRIX) {
600     aj = akok->j_dual.view_device();
601     bj = bkok->j_dual.view_device();
602     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
603     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
604     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
605     ca = ca_dual.view_device();
606     ci = ci_dual.view_device();
607     cj = cj_dual.view_device();
608 
609     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
610     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
611       PetscInt i = t.league_rank(); /* row i */
612       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
613 
614       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
615         ci(i) = coffset;
616         if (i == m-1) ci(m) = ai(m) + bi(m);
617       });
618 
619       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
620         if (k < alen) {
621           ca(coffset+k) = aa(ai(i)+k);
622           cj(coffset+k) = aj(ai(i)+k);
623         } else {
624           ca(coffset+k) = ba(bi(i)+k-alen);
625           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
626         }
627       });
628     });
629     ca_dual.modify_device();
630     ci_dual.modify_device();
631     cj_dual.modify_device();
632     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
633     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
634   } else if (reuse == MAT_REUSE_MATRIX) {
635     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
636     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
637     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
638     ca   = ckok->a_dual.view_device();
639     ci   = ckok->i_dual.view_device();
640 
641     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
642       PetscInt i = t.league_rank(); /* row i */
643       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
644       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
645         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
646         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
647       });
648     });
649     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
650   }
651   PetscFunctionReturn(0);
652 }
653 
654 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
655 {
656   PetscFunctionBegin;
657   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
658   PetscFunctionReturn(0);
659 }
660 
661 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
662 {
663   PetscErrorCode                 ierr;
664   Mat_Product                    *product = C->product;
665   Mat                            A,B;
666   bool                           transA,transB; /* use bool, since KK needs this type */
667   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
668   Mat_SeqAIJ                     *c;
669   MatProductData_SeqAIJKokkos    *pdata;
670   KokkosCsrMatrix                *csrmatA,*csrmatB;
671 
672   PetscFunctionBegin;
673   MatCheckProduct(C,1);
674   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
675   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
676 
677   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
678     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
679     PetscFunctionReturn(0);
680   }
681 
682   switch (product->type) {
683     case MATPRODUCT_AB:  transA = false; transB = false; break;
684     case MATPRODUCT_AtB: transA = true;  transB = false; break;
685     case MATPRODUCT_ABt: transA = false; transB = true;  break;
686     default:
687       SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
688   }
689 
690   A     = product->A;
691   B     = product->B;
692   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
693   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
694   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
695   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
696   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
697 
698   if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
699 
700   csrmatA = &akok->csrmat;
701   csrmatB = &bkok->csrmat;
702 
703   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
704   if (transA) {
705     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
706     transA = false;
707   }
708 
709   if (transB) {
710     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
711     transB = false;
712   }
713 
714   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
715   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
716   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
717   /* shorter version of MatAssemblyEnd_SeqAIJ */
718   c = (Mat_SeqAIJ*)C->data;
719   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
720   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
721   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
722   c->reallocs         = 0;
723   C->info.mallocs     = 0;
724   C->info.nz_unneeded = 0;
725   C->assembled        = C->was_assembled = PETSC_TRUE;
726   C->num_ass++;
727   PetscFunctionReturn(0);
728 }
729 
730 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
731 {
732   PetscErrorCode                 ierr;
733   Mat_Product                    *product = C->product;
734   MatProductType                 ptype;
735   Mat                            A,B;
736   bool                           transA,transB;
737   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
738   MatProductData_SeqAIJKokkos    *pdata;
739   MPI_Comm                       comm;
740   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
741 
742   PetscFunctionBegin;
743   MatCheckProduct(C,1);
744   ierr = PetscObjectGetComm((PetscObject)C,&comm);
745   if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty");
746   A       = product->A;
747   B       = product->B;
748   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
749   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
750   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
751   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
752   csrmatA = &akok->csrmat;
753   csrmatB = &bkok->csrmat;
754 
755   ptype   = product->type;
756   switch (ptype) {
757     case MATPRODUCT_AB:  transA = false; transB = false; break;
758     case MATPRODUCT_AtB: transA = true;  transB = false; break;
759     case MATPRODUCT_ABt: transA = false; transB = true;  break;
760     default:
761       SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
762   }
763 
764   product->data = pdata = new MatProductData_SeqAIJKokkos();
765   pdata->kh.set_team_work_size(16);
766   pdata->kh.set_dynamic_scheduling(true);
767   pdata->reusesym = product->api_user;
768 
769   /* TODO: add command line options to select spgemm algorithms */
770   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
771 #if defined(PETSC_HAVE_CUDA)
772   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
773     /* This algorithm + cuda-10.2 sometimes gave wrong results (invalid device pointers in csrmatC) and failed snes/tutorials/ex56.c */
774     spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_CUSPARSE;
775   #endif
776 #endif
777   pdata->kh.create_spgemm_handle(spgemm_alg);
778 
779   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
780   if (transA) {
781     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
782     transA = false;
783   }
784 
785   if (transB) {
786     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
787     transB = false;
788   }
789 
790   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
791   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
792     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
793     calling new Mat_SeqAIJKokkos().
794     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
795   */
796   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
797   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
798 
799   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
800   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
801   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
802   PetscFunctionReturn(0);
803 }
804 
805 /* handles sparse matrix matrix ops */
806 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
807 {
808   PetscErrorCode ierr;
809   Mat_Product    *product = mat->product;
810   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
811 
812   PetscFunctionBegin;
813   MatCheckProduct(mat,1);
814   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
815   if (product->type == MATPRODUCT_ABC) {
816     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
817   }
818   if (Biskok && Ciskok) {
819     switch (product->type) {
820     case MATPRODUCT_AB:
821     case MATPRODUCT_AtB:
822     case MATPRODUCT_ABt:
823       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
824       break;
825     case MATPRODUCT_PtAP:
826     case MATPRODUCT_RARt:
827     case MATPRODUCT_ABC:
828       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
829       break;
830     default:
831       break;
832     }
833   } else { /* fallback for AIJ */
834     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
835   }
836   PetscFunctionReturn(0);
837 }
838 
839 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
840 {
841   PetscErrorCode   ierr;
842   Mat_SeqAIJKokkos *aijkok;
843 
844   PetscFunctionBegin;
845   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
846   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
847   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
848   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
849   ierr = WaitForKokkos();CHKERRQ(ierr);
850   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
851   PetscFunctionReturn(0);
852 }
853 
854 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
855 {
856   PetscErrorCode   ierr;
857   Mat_SeqAIJKokkos *aijkok;
858 
859   PetscFunctionBegin;
860   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
861   if (aijkok) { /* Only zero the device if data is already there */
862     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
863     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
864   } else { /* Might be preallocated but not assembled */
865     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
866   }
867   PetscFunctionReturn(0);
868 }
869 
870 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
871 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
872 {
873   PetscErrorCode     ierr;
874   Mat_SeqAIJKokkos   *aijkok;
875 
876   PetscFunctionBegin;
877   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
878   PetscValidPointer(kv,2);
879   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
880   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
881   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
882   *kv    = aijkok->a_dual.view_device();
883   PetscFunctionReturn(0);
884 }
885 
886 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
887 {
888   PetscFunctionBegin;
889   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
890   PetscValidPointer(kv,2);
891   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
892   PetscFunctionReturn(0);
893 }
894 
895 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv)
896 {
897   PetscErrorCode     ierr;
898   Mat_SeqAIJKokkos   *aijkok;
899 
900   PetscFunctionBegin;
901   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
902   PetscValidPointer(kv,2);
903   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
904   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
905   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
906   *kv    = aijkok->a_dual.view_device();
907   PetscFunctionReturn(0);
908 }
909 
910 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv)
911 {
912   PetscErrorCode     ierr;
913 
914   PetscFunctionBegin;
915   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
916   PetscValidPointer(kv,2);
917   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
918   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
919   PetscFunctionReturn(0);
920 }
921 
922 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
923 {
924   Mat_SeqAIJKokkos   *aijkok;
925 
926   PetscFunctionBegin;
927   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
928   PetscValidPointer(kv,2);
929   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
930   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
931   *kv    = aijkok->a_dual.view_device();
932   PetscFunctionReturn(0);
933 }
934 
935 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
936 {
937   PetscErrorCode     ierr;
938 
939   PetscFunctionBegin;
940   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
941   PetscValidPointer(kv,2);
942   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
943   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
944   PetscFunctionReturn(0);
945 }
946 
947 /* Computes Y += alpha X */
948 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
949 {
950   PetscErrorCode             ierr;
951   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
952   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
953   ConstMatScalarKokkosView   Xa;
954   MatScalarKokkosView        Ya;
955 
956   PetscFunctionBegin;
957   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
958   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
959   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
960   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
961 
962   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
963     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
964     PetscBool e;
965     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
966     if (e) {
967       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
968       if (e) pattern = SAME_NONZERO_PATTERN;
969     }
970   }
971 
972   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
973     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
974     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
975     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
976   */
977   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
978   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
979   Xa   = xkok->a_dual.view_device();
980   Ya   = ykok->a_dual.view_device();
981 
982   if (pattern == SAME_NONZERO_PATTERN) {
983     KokkosBlas::axpy(alpha,Xa,Ya);
984     ierr = MatSeqAIJKokkosModifyDevice(Y);
985   } else if (pattern == SUBSET_NONZERO_PATTERN) {
986     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
987     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
988 
989     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
990       PetscInt i = t.league_rank(); /* row i */
991       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
992         PetscInt p,q = Yi(i);
993         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
994           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
995           if (Xj(p) == Yj(q)) { /* Found it */
996             Ya(q) += alpha * Xa(p);
997             q++;
998           } else {
999             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1000                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1001             */
1002             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1003           }
1004         }
1005       });
1006     });
1007     ierr = MatSeqAIJKokkosModifyDevice(Y);
1008   } else { /* different nonzero patterns */
1009     Mat             Z;
1010     KokkosCsrMatrix zcsr;
1011     KernelHandle    kh;
1012     kh.create_spadd_handle(false);
1013     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1014     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1015     zkok = new Mat_SeqAIJKokkos(zcsr);
1016     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1017     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1018     kh.destroy_spadd_handle();
1019   }
1020 
1021   PetscFunctionReturn(0);
1022 }
1023 
1024 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1025 {
1026   PetscErrorCode ierr;
1027 
1028   PetscFunctionBegin;
1029   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
1030   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
1031   B->offloadmask = PETSC_OFFLOAD_CPU;
1032   PetscFunctionReturn(0);
1033 }
1034 
1035 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1036 {
1037   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1038 
1039   PetscFunctionBegin;
1040   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1041   A->boundtocpu  = PETSC_FALSE;
1042 
1043   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1044   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1045   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1046   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1047   A->ops->scale                     = MatScale_SeqAIJKokkos;
1048   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1049   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1050   A->ops->mult                      = MatMult_SeqAIJKokkos;
1051   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1052   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1053   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1054   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1055   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1056   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1057   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1058   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1059   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1060   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1061   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1062   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1063   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1064   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1065   PetscFunctionReturn(0);
1066 }
1067 
1068 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1069 {
1070   PetscErrorCode     ierr;
1071   Mat_SeqAIJ         *aseq;
1072   PetscInt           i,m,n;
1073 
1074   PetscFunctionBegin;
1075   if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1076 
1077   m    = akok->nrows();
1078   n    = akok->ncols();
1079   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1080   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1081 
1082   /* Set up data structures of A as a MATSEQAIJ */
1083   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1084   aseq = (Mat_SeqAIJ*)(A)->data;
1085 
1086   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1087   akok->j_dual.sync_host();
1088 
1089   aseq->i            = akok->i_host_data();
1090   aseq->j            = akok->j_host_data();
1091   aseq->a            = akok->a_host_data();
1092   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1093   aseq->singlemalloc = PETSC_FALSE;
1094   aseq->free_a       = PETSC_FALSE;
1095   aseq->free_ij      = PETSC_FALSE;
1096   aseq->nz           = akok->nnz();
1097   aseq->maxnz        = aseq->nz;
1098 
1099   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1100   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1101   for (i=0; i<m; i++) {
1102     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1103   }
1104 
1105   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1106   akok->nonzerostate = A->nonzerostate;
1107   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1108   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1109   A->spptr = akok;
1110   PetscFunctionReturn(0);
1111 }
1112 
1113 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1114 
1115    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1116  */
1117 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1118 {
1119   PetscErrorCode     ierr;
1120 
1121   PetscFunctionBegin;
1122   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1123   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
1124   PetscFunctionReturn(0);
1125 }
1126 
1127 /* --------------------------------------------------------------------------------*/
1128 /*@C
1129    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1130    (the default parallel PETSc format). This matrix will ultimately be handled by
1131    Kokkos for calculations. For good matrix
1132    assembly performance the user should preallocate the matrix storage by setting
1133    the parameter nz (or the array nnz).  By setting these parameters accurately,
1134    performance during matrix assembly can be increased by more than a factor of 50.
1135 
1136    Collective
1137 
1138    Input Parameters:
1139 +  comm - MPI communicator, set to PETSC_COMM_SELF
1140 .  m - number of rows
1141 .  n - number of columns
1142 .  nz - number of nonzeros per row (same for all rows)
1143 -  nnz - array containing the number of nonzeros in the various rows
1144          (possibly different for each row) or NULL
1145 
1146    Output Parameter:
1147 .  A - the matrix
1148 
1149    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1150    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1151    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1152 
1153    Notes:
1154    If nnz is given then nz is ignored
1155 
1156    The AIJ format (also called the Yale sparse matrix format or
1157    compressed row storage), is fully compatible with standard Fortran 77
1158    storage.  That is, the stored row and column indices can begin at
1159    either one (as in Fortran) or zero.  See the users' manual for details.
1160 
1161    Specify the preallocated storage with either nz or nnz (not both).
1162    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1163    allocation.  For large problems you MUST preallocate memory or you
1164    will get TERRIBLE performance, see the users' manual chapter on matrices.
1165 
1166    By default, this format uses inodes (identical nodes) when possible, to
1167    improve numerical efficiency of matrix-vector products and solves. We
1168    search for consecutive rows with the same nonzero structure, thereby
1169    reusing matrix information to achieve increased efficiency.
1170 
1171    Level: intermediate
1172 
1173 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1174 @*/
1175 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1176 {
1177   PetscErrorCode ierr;
1178 
1179   PetscFunctionBegin;
1180   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1181   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1182   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1183   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1184   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1185   PetscFunctionReturn(0);
1186 }
1187 
1188 typedef Kokkos::TeamPolicy<>::member_type team_member;
1189 //
1190 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
1191 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1192 //
1193 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1194 {
1195   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1196   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1197   IS                 isrow = b->row,isicol = b->icol;
1198   PetscErrorCode     ierr;
1199   const PetscInt     *r_h,*ic_h;
1200   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data();
1201   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1202   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1203   PetscBool          row_identity,col_identity;
1204   PetscInt           nc, Nf, nVec=32; // should be a parameter
1205   PetscContainer     container;
1206 
1207   PetscFunctionBegin;
1208   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n);
1209   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
1210   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1211   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
1212   if (container) {
1213     PetscInt *pNf=NULL, nv;
1214     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
1215     Nf = (*pNf)%1000;
1216     nv = (*pNf)/1000;
1217     if (nv>0) nVec = nv;
1218   } else Nf = 1;
1219   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
1220   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
1221   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
1222   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
1223 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1224   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1225 #endif
1226   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1227   {
1228 #define KOKKOS_SHARED_LEVEL 1
1229     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1230     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1231     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1232     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1233     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1234     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1235     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1236     size_t flops_h = 0.0;
1237     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1238     Kokkos::View<size_t> d_flops_k ("flops");
1239     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1240     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1241     Kokkos::deep_copy (d_flops_k, h_flops_k);
1242     Kokkos::deep_copy (d_r_k, h_r_k);
1243     Kokkos::deep_copy (d_ic_k, h_ic_k);
1244     // Fill A --> fact
1245     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1246         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1247         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1248         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1249         // zero rows of B
1250         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1251             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1252             PetscScalar *baL = ba_d + bi_d[rowb];
1253             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1254             /* zero (unfactored row) */
1255             for (int j=0;j<nzbL;j++) baL[j] = 0;
1256             for (int j=0;j<nzbU;j++) baU[j] = 0;
1257           });
1258         // copy A into B
1259         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1260             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1261             const PetscScalar *av    = aa_d + ai_d[rowa];
1262             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1263             /* load in initial (unfactored row) */
1264             for (int j=0;j<nza;j++) {
1265               PetscInt    colb = ic[ajtmp[j]];
1266               PetscScalar vala = av[j];
1267               if (colb == rowb) {
1268                 *(ba_d + bdiag_d[rowb]) = vala;
1269               } else {
1270                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1271                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1272                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1273                 for (int j=0; j<nz ; j++) {
1274                   if (pbj[j] == colb) {
1275                     pba[j] = vala;
1276                     set++;
1277                     break;
1278                   }
1279                 }
1280                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1281               }
1282             }
1283           });
1284       });
1285     Kokkos::fence();
1286 
1287     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1288         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1289         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1290         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1291         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1292         const PetscInt  start = field*nloc, end = start + nloc;
1293         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1294         // A22 panel update for each row A(1,:) and col A(:,1)
1295         for (int ii=start; ii<end-1; ii++) {
1296           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1297           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1298           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1299           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1300           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1301               PetscInt kIdx = j*Ni + field_block_idx;
1302               if (kIdx >= nzUi) /* void */ ;
1303               else {
1304                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1305                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1306                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1307                 size_t         st_idx;
1308                 // find and do L(k,i) = A(:k,i) / A(i,i)
1309                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1310                 // get column, there has got to be a better way
1311                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1312                     if (pjL[j] == ii) {
1313                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1314                       idx = j; // output
1315                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1316                     }
1317                 }, st_idx);
1318                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1319 #if defined(PETSC_USE_DEBUG)
1320                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1321 #endif
1322                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1323                 // U(i+1,:end)
1324                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1325                       PetscScalar Uij = baUi[uiIdx];
1326                       PetscInt    col = bjUi[uiIdx];
1327                       if (col==myk) {
1328                         // A_kk = A_kk - L_ki * U_ij(k)
1329                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1330                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1331                       } else {
1332                         PetscScalar    *start, *end, *pAkjv=NULL;
1333                         PetscInt       high, low;
1334                         const PetscInt *startj;
1335                         if (col<myk) { // L
1336                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1337                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1338                           start = pLki+1; // start at pLki+1, A22(myk,1)
1339                           startj= bj_d + bi_d[myk] + idx;
1340                           end   = ba_d + bi_d[myk+1];
1341                         } else {
1342                           PetscInt idx = bdiag_d[myk+1]+1;
1343                           start = ba_d + idx;
1344                           startj= bj_d + idx;
1345                           end   = ba_d + bdiag_d[myk];
1346                         }
1347                         // search for 'col', use bisection search - TODO
1348                         low  = 0;
1349                         high = (PetscInt)(end-start);
1350                         while (high-low > 5) {
1351                           int t = (low+high)/2;
1352                           if (startj[t] > col) high = t;
1353                           else                 low  = t;
1354                         }
1355                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1356                           if (startj[pAkjv-start] == col) break;
1357                         }
1358 #if defined(PETSC_USE_DEBUG)
1359                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1360 #endif
1361                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1362                       }
1363                     });
1364               }
1365             });
1366           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1367           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1368         } /* endof for (i=0; i<n; i++) { */
1369         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1370       });
1371     Kokkos::fence();
1372     Kokkos::deep_copy (h_flops_k, d_flops_k);
1373 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1374     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1375 #elif  defined(PETSC_USE_LOG)
1376     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1377 #endif
1378     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1379         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1380         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1381         /* Invert diagonal for simpler triangular solves */
1382         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1383             int i = start + outer_index*Ni + lg_rank%Ni;
1384             if (i < end) {
1385               PetscScalar *pv = ba_d + bdiag_d[i];
1386               *pv = 1.0/(*pv);
1387             }
1388           });
1389       });
1390   }
1391 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1392   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1393 #endif
1394   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
1395   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
1396 
1397   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1398   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
1399   if (b->inode.size) {
1400     B->ops->solve = MatSolve_SeqAIJ_Inode;
1401   } else if (row_identity && col_identity) {
1402     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1403   } else {
1404     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1405   }
1406   B->offloadmask = PETSC_OFFLOAD_GPU;
1407   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
1408   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1409   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1410   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1411   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1412   B->assembled              = PETSC_TRUE;
1413   B->preallocated           = PETSC_TRUE;
1414 
1415   PetscFunctionReturn(0);
1416 }
1417 
1418 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1419 {
1420   PetscErrorCode   ierr;
1421 
1422   PetscFunctionBegin;
1423   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1424   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1425   PetscFunctionReturn(0);
1426 }
1427 
1428 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1429 {
1430   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1431 
1432   PetscFunctionBegin;
1433   if (!factors->sptrsv_symbolic_completed) {
1434     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1435     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1436     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1437   }
1438   PetscFunctionReturn(0);
1439 }
1440 
1441 /* Check if we need to update factors etc for transpose solve */
1442 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1443 {
1444   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1445   MatColIdxType             n = A->rmap->n;
1446 
1447   PetscFunctionBegin;
1448   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1449     /* Update L^T and do sptrsv symbolic */
1450     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1451     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1452     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1453     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1454 
1455     KokkosKernels::Impl::transpose_matrix<
1456       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1457       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1458       MatRowMapKokkosView,DefaultExecutionSpace>(
1459         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1460         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1461 
1462     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1463       We have to sort the indices, until KK provides finer control options.
1464     */
1465     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1466       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1467         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1468 
1469     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1470 
1471     /* Update U^T and do sptrsv symbolic */
1472     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1473     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1474     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1475     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1476 
1477     KokkosKernels::Impl::transpose_matrix<
1478       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1479       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1480       MatRowMapKokkosView,DefaultExecutionSpace>(
1481         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1482         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1483 
1484     /* Sort indices. See comments above */
1485     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1486       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1487         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1488 
1489     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1490     factors->transpose_updated = PETSC_TRUE;
1491   }
1492   PetscFunctionReturn(0);
1493 }
1494 
1495 /* Solve Ax = b, with A = LU */
1496 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1497 {
1498   PetscErrorCode                 ierr;
1499   ConstPetscScalarKokkosView     bv;
1500   PetscScalarKokkosView          xv;
1501   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1502 
1503   PetscFunctionBegin;
1504   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
1505   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1506   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1507   /* Solve L tmpv = b */
1508   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1509   /* Solve Ux = tmpv */
1510   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1511   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1512   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1513   PetscFunctionReturn(0);
1514 }
1515 
1516 /* Solve A^T x = b, where A^T = U^T L^T */
1517 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1518 {
1519   PetscErrorCode                 ierr;
1520   ConstPetscScalarKokkosView     bv;
1521   PetscScalarKokkosView          xv;
1522   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1523 
1524   PetscFunctionBegin;
1525   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
1526   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1527   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1528   /* Solve U^T tmpv = b */
1529   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1530 
1531   /* Solve L^T x = tmpv */
1532   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1533   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1534   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1535   PetscFunctionReturn(0);
1536 }
1537 
1538 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1539 {
1540   PetscErrorCode                 ierr;
1541   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1542   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1543   PetscInt                       fill_lev = info->levels;
1544 
1545   PetscFunctionBegin;
1546   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1547 
1548   auto a_d = aijkok->a_dual.view_device();
1549   auto i_d = aijkok->i_dual.view_device();
1550   auto j_d = aijkok->j_dual.view_device();
1551 
1552   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1553 
1554   B->assembled                       = PETSC_TRUE;
1555   B->preallocated                    = PETSC_TRUE;
1556   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1557   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1558   B->ops->matsolve                   = NULL;
1559   B->ops->matsolvetranspose          = NULL;
1560   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1561 
1562   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1563   factors->transpose_updated         = PETSC_FALSE;
1564   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1565   PetscFunctionReturn(0);
1566 }
1567 
1568 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1569 {
1570   PetscErrorCode                 ierr;
1571   Mat_SeqAIJKokkos               *aijkok;
1572   Mat_SeqAIJ                     *b;
1573   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1574   PetscInt                       fill_lev = info->levels;
1575   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1576   PetscInt                       n = A->rmap->n;
1577 
1578   PetscFunctionBegin;
1579   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1580   /* Rebuild factors */
1581   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1582   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1583 
1584   /* Create a spiluk handle and then do symbolic factorization */
1585   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1586   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1587 
1588   auto spiluk_handle = factors->kh.get_spiluk_handle();
1589 
1590   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1591   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1592   Kokkos::realloc(factors->iU_d,n+1);
1593   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1594 
1595   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1596   auto i_d = aijkok->i_dual.view_device();
1597   auto j_d = aijkok->j_dual.view_device();
1598   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1599   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1600 
1601   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1602   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1603   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1604   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1605 
1606   /* TODO: add options to select sptrsv algorithms */
1607   /* Create sptrsv handles for L, U and their transpose */
1608  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1609   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1610  #else
1611   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1612  #endif
1613 
1614   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1615   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1616   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1617   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1618 
1619   /* Fill fields of the factor matrix B */
1620   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1621   b     = (Mat_SeqAIJ*)B->data;
1622   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1623   B->info.fill_ratio_given  = info->fill;
1624   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1625 
1626   B->offloadmask            = PETSC_OFFLOAD_GPU;
1627   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1628   PetscFunctionReturn(0);
1629 }
1630 
1631 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1632 {
1633   PetscErrorCode   ierr;
1634   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1635   const PetscInt   nrows   = A->rmap->n;
1636 
1637   PetscFunctionBegin;
1638   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1639   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1640   // move B data into Kokkos
1641   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
1642   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
1643   {
1644     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1645     if (!baijkok->diag_d) {
1646       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1647       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1648       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
1649     }
1650   }
1651   PetscFunctionReturn(0);
1652 }
1653 
1654 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1655 {
1656   PetscFunctionBegin;
1657   *type = MATSOLVERKOKKOS;
1658   PetscFunctionReturn(0);
1659 }
1660 
1661 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1662 {
1663   PetscFunctionBegin;
1664   *type = MATSOLVERKOKKOSDEVICE;
1665   PetscFunctionReturn(0);
1666 }
1667 
1668 /*MC
1669   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1670   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1671 
1672   Level: beginner
1673 
1674 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1675 M*/
1676 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1677 {
1678   PetscErrorCode ierr;
1679   PetscInt       n = A->rmap->n;
1680 
1681   PetscFunctionBegin;
1682   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1683   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1684   (*B)->factortype = ftype;
1685   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1686   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1687 
1688   if (ftype == MAT_FACTOR_LU) {
1689     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1690     (*B)->canuseordering         = PETSC_TRUE;
1691     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1692   } else if (ftype == MAT_FACTOR_ILU) {
1693     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1694     (*B)->canuseordering         = PETSC_FALSE;
1695     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1696   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1697 
1698   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1699   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1700   PetscFunctionReturn(0);
1701 }
1702 
1703 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1704 {
1705   PetscErrorCode ierr;
1706   PetscInt       n = A->rmap->n;
1707 
1708   PetscFunctionBegin;
1709   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1710   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1711   (*B)->factortype = ftype;
1712   (*B)->canuseordering = PETSC_TRUE;
1713   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1714   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1715 
1716   if (ftype == MAT_FACTOR_LU) {
1717     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1718     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1719   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1720 
1721   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1722   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1723   PetscFunctionReturn(0);
1724 }
1725 
1726 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1727 {
1728   PetscErrorCode ierr;
1729 
1730   PetscFunctionBegin;
1731   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1732   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1733   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1734   PetscFunctionReturn(0);
1735 }
1736 
1737 /* Utility to print out a KokkosCsrMatrix for debugging */
1738 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1739 {
1740   PetscErrorCode    ierr;
1741   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1742   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1743   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1744   const PetscInt    *i = iv.data();
1745   const PetscInt    *j = jv.data();
1746   const PetscScalar *a = av.data();
1747   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1748 
1749   PetscFunctionBegin;
1750   ierr = PetscPrintf(PETSC_COMM_SELF,"%D x %D SeqAIJKokkos, with %D nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1751   for (PetscInt k=0; k<m; k++) {
1752     ierr = PetscPrintf(PETSC_COMM_SELF,"%D: ",k);CHKERRQ(ierr);
1753     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1754       ierr = PetscPrintf(PETSC_COMM_SELF,"%D(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1755     }
1756     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1757   }
1758   PetscFunctionReturn(0);
1759 }
1760