xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 46804e07f02eced2c7dbc569d3d3a2c436994381)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petscsystypes.h>
5 #include <petscerror.h>
6 
7 #include <Kokkos_Core.hpp>
8 #include <KokkosBlas.hpp>
9 #include <KokkosKernels_Sorting.hpp>
10 #include <KokkosSparse_CrsMatrix.hpp>
11 #include <KokkosSparse_spmv.hpp>
12 #include <KokkosSparse_spiluk.hpp>
13 #include <KokkosSparse_sptrsv.hpp>
14 #include <KokkosSparse_spgemm.hpp>
15 #include <KokkosSparse_spadd.hpp>
16 
17 #include <../src/mat/impls/aij/seq/aij.h>
18 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   PetscErrorCode    ierr;
29   Mat_SeqAIJ        *aijseq;
30   Mat_SeqAIJKokkos  *aijkok;
31 
32   PetscFunctionBegin;
33   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
34   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
35 
36   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38 
39   /* If aijkok does not exist, we just copy i, j to device.
40      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
41      In both cases, we build a new aijkok structure.
42   */
43   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
44     delete aijkok;
45     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
46     A->spptr = aijkok;
47   }
48 
49   if (aijkok && aijkok->device_mat_d.data()) {
50     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
51   }
52   PetscFunctionReturn(0);
53 }
54 
55 /* Sync CSR data to device if not yet */
56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
57 {
58   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
59 
60   PetscFunctionBegin;
61   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
62   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
63   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
64   if (aijkok->a_dual.need_sync_device()) {
65     aijkok->a_dual.sync_device();
66     aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */
67     aijkok->hermitian_updated = PETSC_FALSE;
68   }
69   PetscFunctionReturn(0);
70 }
71 
72 /* Mark the CSR data on device as modified */
73 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
74 {
75   PetscErrorCode   ierr;
76   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
77 
78   PetscFunctionBegin;
79   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
80   aijkok->a_dual.clear_sync_state();
81   aijkok->a_dual.modify_device();
82   aijkok->transpose_updated = PETSC_FALSE;
83   aijkok->hermitian_updated = PETSC_FALSE;
84   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
85   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
86   PetscFunctionReturn(0);
87 }
88 
89 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
90 {
91   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
92 
93   PetscFunctionBegin;
94   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
95    /* We do not expect one needs factors on host  */
96   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
97   if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
98   aijkok->a_dual.sync_host();
99   PetscFunctionReturn(0);
100 }
101 
102 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
103 {
104   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
105 
106   PetscFunctionBegin;
107   if (aijkok) {
108     aijkok->a_dual.sync_host();
109     *array = aijkok->a_dual.view_host().data();
110   } else { /* Happens when calling MatSetValues on a newly created matrix */
111     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
112   }
113   PetscFunctionReturn(0);
114 }
115 
116 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
117 {
118   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
119 
120   PetscFunctionBegin;
121   if (aijkok) aijkok->a_dual.modify_host();
122   PetscFunctionReturn(0);
123 }
124 
125 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
126 {
127   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
128 
129   PetscFunctionBegin;
130   if (aijkok) {
131     aijkok->a_dual.sync_host();
132     *array = aijkok->a_dual.view_host().data();
133   } else {
134     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
135   }
136   PetscFunctionReturn(0);
137 }
138 
139 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
140 {
141   PetscFunctionBegin;
142   *array = NULL;
143   PetscFunctionReturn(0);
144 }
145 
146 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
147 {
148   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
149 
150   PetscFunctionBegin;
151   if (aijkok) {
152     *array = aijkok->a_dual.view_host().data();
153   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
154     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
155   }
156   PetscFunctionReturn(0);
157 }
158 
159 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
160 {
161   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
162 
163   PetscFunctionBegin;
164   if (aijkok) {
165     aijkok->a_dual.clear_sync_state();
166     aijkok->a_dual.modify_host();
167   }
168   PetscFunctionReturn(0);
169 }
170 
171 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
172 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
173 {
174   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
175   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
176 
177   PetscFunctionBegin;
178   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
179   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
180   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
181   PetscFunctionReturn(0);
182 }
183 
184 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
185 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
186 {
187   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
188 
189   PetscFunctionBegin;
190   if (aijkok && aijkok->device_mat_d.data()) {
191     *d_mat = aijkok->device_mat_d.data();
192   } else {
193     PetscErrorCode   ierr;
194     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
195     *d_mat  = NULL;
196   }
197   PetscFunctionReturn(0);
198 }
199 
200 /* Generate the transpose on device and cache it internally */
201 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
202 {
203   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
204 
205   PetscFunctionBegin;
206   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
208     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
209     CHKERRCXX(aijkok->a_dual.sync_device());
210     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
211     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
212     aijkok->transpose_updated = PETSC_TRUE;
213   }
214   *csrmatT = &aijkok->csrmatT;
215   PetscFunctionReturn(0);
216 }
217 
218 /* Generate the Hermitian on device and cache it internally */
219 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
220 {
221   PetscErrorCode                   ierr;
222   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
223 
224   PetscFunctionBegin;
225   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
226   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
227   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
228     CHKERRCXX(aijkok->a_dual.sync_device());
229     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
230     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
231    #if defined(PETSC_USE_COMPLEX)
232     const auto& a = aijkok->csrmatH.values;
233     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
234    #endif
235     aijkok->hermitian_updated = PETSC_TRUE;
236   }
237   *csrmatH = &aijkok->csrmatH;
238   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
239   PetscFunctionReturn(0);
240 }
241 
242 /* y = A x */
243 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
244 {
245   PetscErrorCode                   ierr;
246   Mat_SeqAIJKokkos                 *aijkok;
247   ConstPetscScalarKokkosView       xv;
248   PetscScalarKokkosView            yv;
249 
250   PetscFunctionBegin;
251   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
252   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
253   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
254   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
255   ierr   = PetscLogGpuTimeBegin();CHKERRQ(ierr);
256   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
257   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
258   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
259   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
260   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
261   PetscFunctionReturn(0);
262 }
263 
264 /* y = A^T x */
265 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
266 {
267   PetscErrorCode                   ierr;
268   Mat_SeqAIJKokkos                 *aijkok;
269   const char                       *mode;
270   ConstPetscScalarKokkosView       xv;
271   PetscScalarKokkosView            yv;
272   KokkosCsrMatrix                  *csrmat;
273 
274   PetscFunctionBegin;
275   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
276   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
277   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
278   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
279   if (A->form_explicit_transpose) {
280     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
281     mode = "N";
282   } else {
283     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
284     csrmat = &aijkok->csrmat;
285     mode = "T";
286   }
287   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
288   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
289   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
290   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
291   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
292   PetscFunctionReturn(0);
293 }
294 
295 /* y = A^H x */
296 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
297 {
298   PetscErrorCode                   ierr;
299   Mat_SeqAIJKokkos                 *aijkok;
300   const char                       *mode;
301   ConstPetscScalarKokkosView       xv;
302   PetscScalarKokkosView            yv;
303   KokkosCsrMatrix                  *csrmat;
304 
305   PetscFunctionBegin;
306   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
307   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
308   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
309   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
310   if (A->form_explicit_transpose) {
311     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
312     mode = "N";
313   } else {
314     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
315     csrmat = &aijkok->csrmat;
316     mode = "C";
317   }
318   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
319   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
320   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
321   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
322   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
323   PetscFunctionReturn(0);
324 }
325 
326 /* z = A x + y */
327 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
328 {
329   PetscErrorCode                   ierr;
330   Mat_SeqAIJKokkos                 *aijkok;
331   ConstPetscScalarKokkosView       xv,yv;
332   PetscScalarKokkosView            zv;
333 
334   PetscFunctionBegin;
335   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
336   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
337   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
338   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
339   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
340   if (zz != yy) Kokkos::deep_copy(zv,yv);
341   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
342   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
343   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
344   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
345   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
346   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
347   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
348   PetscFunctionReturn(0);
349 }
350 
351 /* z = A^T x + y */
352 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
353 {
354   PetscErrorCode                   ierr;
355   Mat_SeqAIJKokkos                 *aijkok;
356   const char                       *mode;
357   ConstPetscScalarKokkosView       xv,yv;
358   PetscScalarKokkosView            zv;
359   KokkosCsrMatrix                  *csrmat;
360 
361   PetscFunctionBegin;
362   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
363   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
364   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
365   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
366   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
367   if (zz != yy) Kokkos::deep_copy(zv,yv);
368   if (A->form_explicit_transpose) {
369     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
370     mode = "N";
371   } else {
372     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
373     csrmat = &aijkok->csrmat;
374     mode = "T";
375   }
376   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
377   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
378   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
379   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
380   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
381   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
382   PetscFunctionReturn(0);
383 }
384 
385 /* z = A^H x + y */
386 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
387 {
388   PetscErrorCode                   ierr;
389   Mat_SeqAIJKokkos                 *aijkok;
390   const char                       *mode;
391   ConstPetscScalarKokkosView       xv,yv;
392   PetscScalarKokkosView            zv;
393   KokkosCsrMatrix                  *csrmat;
394 
395   PetscFunctionBegin;
396   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
397   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
398   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
399   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
400   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
401   if (zz != yy) Kokkos::deep_copy(zv,yv);
402   if (A->form_explicit_transpose) {
403     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
404     mode = "N";
405   } else {
406     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
407     csrmat = &aijkok->csrmat;
408     mode = "C";
409   }
410   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
411   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
412   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
413   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
414   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
415   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
416   PetscFunctionReturn(0);
417 }
418 
419 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
420 {
421   PetscErrorCode            ierr;
422   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
423 
424   PetscFunctionBegin;
425   switch (op) {
426     case MAT_FORM_EXPLICIT_TRANSPOSE:
427       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
428       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
429       A->form_explicit_transpose = flg;
430       break;
431     default:
432       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
433       break;
434   }
435   PetscFunctionReturn(0);
436 }
437 
438 /* Depending on reuse, either build a new mat, or use the existing mat */
439 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
440 {
441   PetscErrorCode   ierr;
442   Mat_SeqAIJ       *aseq;
443 
444   PetscFunctionBegin;
445   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
446   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
447     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
448   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
449     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
450   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
451     if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
452     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
453     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
454     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
455     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
456     aseq = static_cast<Mat_SeqAIJ*>(A->data);
457     if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */
458       if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
459       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
460     }
461   }
462   PetscFunctionReturn(0);
463 }
464 
465 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
466    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
467  */
468 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
469 {
470   PetscErrorCode        ierr;
471   Mat_SeqAIJ            *bseq;
472   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
473   Mat                   mat;
474 
475   PetscFunctionBegin;
476   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
477   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
478   mat  = *B;
479   if (A->assembled) {
480     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
481     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
482     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
483     /* Now copy values to B if needed */
484     if (dupOption == MAT_COPY_VALUES) {
485       if (akok->a_dual.need_sync_device()) {
486         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
487         bkok->a_dual.modify_host();
488       } else { /* If device has the latest data, we only copy data on device */
489         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
490         bkok->a_dual.modify_device();
491       }
492     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
493       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
494       bkok->a_dual.modify_host();
495     }
496     mat->spptr = bkok;
497   }
498 
499   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
500   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
501   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
502   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
503   PetscFunctionReturn(0);
504 }
505 
506 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
507 {
508   PetscErrorCode    ierr;
509   Mat               At;
510   KokkosCsrMatrix   *internT,*csrmatT;
511   Mat_SeqAIJKokkos  *atkok,*bkok;
512 
513   PetscFunctionBegin;
514   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
515   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
516     CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */
517     CHKERRCXX(atkok   = new Mat_SeqAIJKokkos(*csrmatT));
518     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
519     if (reuse == MAT_INITIAL_MATRIX) *B = At;
520     else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
521   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
522     if ((*B)->assembled) {
523       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
524       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
525       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
526     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
527       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
528       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
529       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
530       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
531       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
532     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
533   }
534   PetscFunctionReturn(0);
535 }
536 
537 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
538 {
539   PetscErrorCode             ierr;
540   Mat_SeqAIJKokkos           *aijkok;
541 
542   PetscFunctionBegin;
543   if (A->factortype == MAT_FACTOR_NONE) {
544     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
545     if (aijkok) {
546       if (aijkok->device_mat_d.data()) {
547         delete aijkok->colmap_d;
548         delete aijkok->i_uncompressed_d;
549       }
550       if (aijkok->diag_d) delete aijkok->diag_d;
551     }
552     delete aijkok;
553   } else {
554     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
555   }
556   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
557   A->spptr = NULL;
558   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
559   PetscFunctionReturn(0);
560 }
561 
562 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
563 {
564   PetscErrorCode ierr;
565 
566   PetscFunctionBegin;
567   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
568   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
569   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
570   PetscFunctionReturn(0);
571 }
572 
573 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
574 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
575 {
576   PetscErrorCode               ierr;
577   Mat_SeqAIJ                   *a,*b;
578   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
579   MatScalarKokkosView          aa,ba,ca;
580   MatRowMapKokkosView          ai,bi,ci;
581   MatColIdxKokkosView          aj,bj,cj;
582   PetscInt                     m,n,nnz,aN;
583 
584   PetscFunctionBegin;
585   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
586   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
587   PetscValidPointer(C,4);
588   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
589   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
590   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
591   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
592 
593   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
594   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
595   a    = static_cast<Mat_SeqAIJ*>(A->data);
596   b    = static_cast<Mat_SeqAIJ*>(B->data);
597   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
598   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
599   aa   = akok->a_dual.view_device();
600   ai   = akok->i_dual.view_device();
601   ba   = bkok->a_dual.view_device();
602   bi   = bkok->i_dual.view_device();
603   m    = A->rmap->n; /* M, N and nnz of C */
604   n    = A->cmap->n + B->cmap->n;
605   nnz  = a->nz + b->nz;
606   aN   = A->cmap->n; /* N of A */
607   if (reuse == MAT_INITIAL_MATRIX) {
608     aj = akok->j_dual.view_device();
609     bj = bkok->j_dual.view_device();
610     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
611     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
612     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
613     ca = ca_dual.view_device();
614     ci = ci_dual.view_device();
615     cj = cj_dual.view_device();
616 
617     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
618     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
619       PetscInt i = t.league_rank(); /* row i */
620       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
621 
622       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
623         ci(i) = coffset;
624         if (i == m-1) ci(m) = ai(m) + bi(m);
625       });
626 
627       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
628         if (k < alen) {
629           ca(coffset+k) = aa(ai(i)+k);
630           cj(coffset+k) = aj(ai(i)+k);
631         } else {
632           ca(coffset+k) = ba(bi(i)+k-alen);
633           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
634         }
635       });
636     });
637     ca_dual.modify_device();
638     ci_dual.modify_device();
639     cj_dual.modify_device();
640     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
641     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
642   } else if (reuse == MAT_REUSE_MATRIX) {
643     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
644     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
645     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
646     ca   = ckok->a_dual.view_device();
647     ci   = ckok->i_dual.view_device();
648 
649     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
650       PetscInt i = t.league_rank(); /* row i */
651       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
652       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
653         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
654         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
655       });
656     });
657     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
658   }
659   PetscFunctionReturn(0);
660 }
661 
662 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
663 {
664   PetscFunctionBegin;
665   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
666   PetscFunctionReturn(0);
667 }
668 
669 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
670 {
671   PetscErrorCode                 ierr;
672   Mat_Product                    *product = C->product;
673   Mat                            A,B;
674   bool                           transA,transB; /* use bool, since KK needs this type */
675   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
676   Mat_SeqAIJ                     *c;
677   MatProductData_SeqAIJKokkos    *pdata;
678   KokkosCsrMatrix                *csrmatA,*csrmatB;
679 
680   PetscFunctionBegin;
681   MatCheckProduct(C,1);
682   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
683   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
684 
685   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
686     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
687     PetscFunctionReturn(0);
688   }
689 
690   switch (product->type) {
691     case MATPRODUCT_AB:  transA = false; transB = false; break;
692     case MATPRODUCT_AtB: transA = true;  transB = false; break;
693     case MATPRODUCT_ABt: transA = false; transB = true;  break;
694     default:
695       SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
696   }
697 
698   A     = product->A;
699   B     = product->B;
700   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
701   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
702   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
703   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
704   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
705 
706   if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
707 
708   csrmatA = &akok->csrmat;
709   csrmatB = &bkok->csrmat;
710 
711   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
712   if (transA) {
713     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
714     transA = false;
715   }
716 
717   if (transB) {
718     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
719     transB = false;
720   }
721   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
722   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
723   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
724   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
725   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
726   /* shorter version of MatAssemblyEnd_SeqAIJ */
727   c = (Mat_SeqAIJ*)C->data;
728   ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
729   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
730   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
731   c->reallocs         = 0;
732   C->info.mallocs     = 0;
733   C->info.nz_unneeded = 0;
734   C->assembled        = C->was_assembled = PETSC_TRUE;
735   C->num_ass++;
736   PetscFunctionReturn(0);
737 }
738 
739 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
740 {
741   PetscErrorCode                 ierr;
742   Mat_Product                    *product = C->product;
743   MatProductType                 ptype;
744   Mat                            A,B;
745   bool                           transA,transB;
746   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
747   MatProductData_SeqAIJKokkos    *pdata;
748   MPI_Comm                       comm;
749   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
750 
751   PetscFunctionBegin;
752   MatCheckProduct(C,1);
753   ierr = PetscObjectGetComm((PetscObject)C,&comm);
754   if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty");
755   A       = product->A;
756   B       = product->B;
757   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
758   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
759   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
760   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
761   csrmatA = &akok->csrmat;
762   csrmatB = &bkok->csrmat;
763 
764   ptype   = product->type;
765   switch (ptype) {
766     case MATPRODUCT_AB:  transA = false; transB = false; break;
767     case MATPRODUCT_AtB: transA = true;  transB = false; break;
768     case MATPRODUCT_ABt: transA = false; transB = true;  break;
769     default:
770       SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
771   }
772 
773   product->data = pdata = new MatProductData_SeqAIJKokkos();
774   pdata->kh.set_team_work_size(16);
775   pdata->kh.set_dynamic_scheduling(true);
776   pdata->reusesym = product->api_user;
777 
778   /* TODO: add command line options to select spgemm algorithms */
779   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
780   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
781      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
782    */
783   pdata->kh.create_spgemm_handle(spgemm_alg);
784 
785   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
786   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
787   if (transA) {
788     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
789     transA = false;
790   }
791 
792   if (transB) {
793     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
794     transB = false;
795   }
796 
797   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
798   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
799     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
800     calling new Mat_SeqAIJKokkos().
801     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
802   */
803   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
804   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
805   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
806 
807   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
808   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
809   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
810   PetscFunctionReturn(0);
811 }
812 
813 /* handles sparse matrix matrix ops */
814 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
815 {
816   PetscErrorCode ierr;
817   Mat_Product    *product = mat->product;
818   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
819 
820   PetscFunctionBegin;
821   MatCheckProduct(mat,1);
822   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
823   if (product->type == MATPRODUCT_ABC) {
824     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
825   }
826   if (Biskok && Ciskok) {
827     switch (product->type) {
828     case MATPRODUCT_AB:
829     case MATPRODUCT_AtB:
830     case MATPRODUCT_ABt:
831       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
832       break;
833     case MATPRODUCT_PtAP:
834     case MATPRODUCT_RARt:
835     case MATPRODUCT_ABC:
836       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
837       break;
838     default:
839       break;
840     }
841   } else { /* fallback for AIJ */
842     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
843   }
844   PetscFunctionReturn(0);
845 }
846 
847 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
848 {
849   PetscErrorCode   ierr;
850   Mat_SeqAIJKokkos *aijkok;
851 
852   PetscFunctionBegin;
853   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
854   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
855   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
856   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
857   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
858   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
859   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
860   PetscFunctionReturn(0);
861 }
862 
863 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
864 {
865   PetscErrorCode   ierr;
866   Mat_SeqAIJKokkos *aijkok;
867 
868   PetscFunctionBegin;
869   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
870   if (aijkok) { /* Only zero the device if data is already there */
871     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
872     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
873   } else { /* Might be preallocated but not assembled */
874     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
875   }
876   PetscFunctionReturn(0);
877 }
878 
879 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
880 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
881 {
882   PetscErrorCode     ierr;
883   Mat_SeqAIJKokkos   *aijkok;
884 
885   PetscFunctionBegin;
886   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
887   PetscValidPointer(kv,2);
888   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
889   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
890   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
891   *kv    = aijkok->a_dual.view_device();
892   PetscFunctionReturn(0);
893 }
894 
895 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
896 {
897   PetscFunctionBegin;
898   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
899   PetscValidPointer(kv,2);
900   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
901   PetscFunctionReturn(0);
902 }
903 
904 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv)
905 {
906   PetscErrorCode     ierr;
907   Mat_SeqAIJKokkos   *aijkok;
908 
909   PetscFunctionBegin;
910   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
911   PetscValidPointer(kv,2);
912   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
913   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
914   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
915   *kv    = aijkok->a_dual.view_device();
916   PetscFunctionReturn(0);
917 }
918 
919 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv)
920 {
921   PetscErrorCode     ierr;
922 
923   PetscFunctionBegin;
924   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
925   PetscValidPointer(kv,2);
926   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
927   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
928   PetscFunctionReturn(0);
929 }
930 
931 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
932 {
933   Mat_SeqAIJKokkos   *aijkok;
934 
935   PetscFunctionBegin;
936   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
937   PetscValidPointer(kv,2);
938   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
939   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
940   *kv    = aijkok->a_dual.view_device();
941   PetscFunctionReturn(0);
942 }
943 
944 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
945 {
946   PetscErrorCode     ierr;
947 
948   PetscFunctionBegin;
949   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
950   PetscValidPointer(kv,2);
951   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
952   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
953   PetscFunctionReturn(0);
954 }
955 
956 /* Computes Y += alpha X */
957 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
958 {
959   PetscErrorCode             ierr;
960   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
961   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
962   ConstMatScalarKokkosView   Xa;
963   MatScalarKokkosView        Ya;
964 
965   PetscFunctionBegin;
966   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
967   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
968   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
969   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
970   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
971 
972   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
973     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
974     PetscBool e;
975     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
976     if (e) {
977       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
978       if (e) pattern = SAME_NONZERO_PATTERN;
979     }
980   }
981 
982   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
983     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
984     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
985     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
986   */
987   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
988   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
989   Xa   = xkok->a_dual.view_device();
990   Ya   = ykok->a_dual.view_device();
991 
992   if (pattern == SAME_NONZERO_PATTERN) {
993     KokkosBlas::axpy(alpha,Xa,Ya);
994     ierr = MatSeqAIJKokkosModifyDevice(Y);
995   } else if (pattern == SUBSET_NONZERO_PATTERN) {
996     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
997     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
998 
999     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
1000       PetscInt i = t.league_rank(); /* row i */
1001       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
1002         PetscInt p,q = Yi(i);
1003         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1004           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1005           if (Xj(p) == Yj(q)) { /* Found it */
1006             Ya(q) += alpha * Xa(p);
1007             q++;
1008           } else {
1009             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1010                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1011             */
1012             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1013           }
1014         }
1015       });
1016     });
1017     ierr = MatSeqAIJKokkosModifyDevice(Y);
1018   } else { /* different nonzero patterns */
1019     Mat             Z;
1020     KokkosCsrMatrix zcsr;
1021     KernelHandle    kh;
1022     kh.create_spadd_handle(false);
1023     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1024     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1025     zkok = new Mat_SeqAIJKokkos(zcsr);
1026     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1027     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1028     kh.destroy_spadd_handle();
1029   }
1030   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1031   ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */
1032   PetscFunctionReturn(0);
1033 }
1034 
1035 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1036 {
1037   PetscErrorCode ierr;
1038 
1039   PetscFunctionBegin;
1040   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
1041   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
1042   B->offloadmask = PETSC_OFFLOAD_CPU;
1043   PetscFunctionReturn(0);
1044 }
1045 
1046 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1047 {
1048   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1049 
1050   PetscFunctionBegin;
1051   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1052   A->boundtocpu  = PETSC_FALSE;
1053 
1054   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1055   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1056   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1057   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1058   A->ops->scale                     = MatScale_SeqAIJKokkos;
1059   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1060   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1061   A->ops->mult                      = MatMult_SeqAIJKokkos;
1062   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1063   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1064   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1065   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1066   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1067   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1068   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1069   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1070   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1071   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1072   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1073   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1074   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1075   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1076   PetscFunctionReturn(0);
1077 }
1078 
1079 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1080 {
1081   PetscErrorCode ierr;
1082   Mat_SeqAIJ     *aseq;
1083   PetscInt       i,m,n;
1084 
1085   PetscFunctionBegin;
1086   if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1087 
1088   m    = akok->nrows();
1089   n    = akok->ncols();
1090   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1091   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1092 
1093   /* Set up data structures of A as a MATSEQAIJ */
1094   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1095   aseq = (Mat_SeqAIJ*)(A)->data;
1096 
1097   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1098   akok->j_dual.sync_host();
1099 
1100   aseq->i            = akok->i_host_data();
1101   aseq->j            = akok->j_host_data();
1102   aseq->a            = akok->a_host_data();
1103   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1104   aseq->singlemalloc = PETSC_FALSE;
1105   aseq->free_a       = PETSC_FALSE;
1106   aseq->free_ij      = PETSC_FALSE;
1107   aseq->nz           = akok->nnz();
1108   aseq->maxnz        = aseq->nz;
1109 
1110   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1111   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1112   for (i=0; i<m; i++) {
1113     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1114   }
1115 
1116   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1117   akok->nonzerostate = A->nonzerostate;
1118   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1119   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1120   A->spptr = akok;
1121   PetscFunctionReturn(0);
1122 }
1123 
1124 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1125 
1126    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1127  */
1128 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1129 {
1130   PetscErrorCode ierr;
1131 
1132   PetscFunctionBegin;
1133   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1134   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
1135   PetscFunctionReturn(0);
1136 }
1137 
1138 /* --------------------------------------------------------------------------------*/
1139 /*@C
1140    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1141    (the default parallel PETSc format). This matrix will ultimately be handled by
1142    Kokkos for calculations. For good matrix
1143    assembly performance the user should preallocate the matrix storage by setting
1144    the parameter nz (or the array nnz).  By setting these parameters accurately,
1145    performance during matrix assembly can be increased by more than a factor of 50.
1146 
1147    Collective
1148 
1149    Input Parameters:
1150 +  comm - MPI communicator, set to PETSC_COMM_SELF
1151 .  m - number of rows
1152 .  n - number of columns
1153 .  nz - number of nonzeros per row (same for all rows)
1154 -  nnz - array containing the number of nonzeros in the various rows
1155          (possibly different for each row) or NULL
1156 
1157    Output Parameter:
1158 .  A - the matrix
1159 
1160    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1161    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1162    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1163 
1164    Notes:
1165    If nnz is given then nz is ignored
1166 
1167    The AIJ format (also called the Yale sparse matrix format or
1168    compressed row storage), is fully compatible with standard Fortran 77
1169    storage.  That is, the stored row and column indices can begin at
1170    either one (as in Fortran) or zero.  See the users' manual for details.
1171 
1172    Specify the preallocated storage with either nz or nnz (not both).
1173    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1174    allocation.  For large problems you MUST preallocate memory or you
1175    will get TERRIBLE performance, see the users' manual chapter on matrices.
1176 
1177    By default, this format uses inodes (identical nodes) when possible, to
1178    improve numerical efficiency of matrix-vector products and solves. We
1179    search for consecutive rows with the same nonzero structure, thereby
1180    reusing matrix information to achieve increased efficiency.
1181 
1182    Level: intermediate
1183 
1184 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1185 @*/
1186 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1187 {
1188   PetscErrorCode ierr;
1189 
1190   PetscFunctionBegin;
1191   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1192   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1193   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1194   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1195   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1196   PetscFunctionReturn(0);
1197 }
1198 
1199 typedef Kokkos::TeamPolicy<>::member_type team_member;
1200 //
1201 // This factorization exploits block diagonal matrices with "Nf" (not used).
1202 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1203 //
1204 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1205 {
1206   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1207   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1208   IS                 isrow = b->row,isicol = b->icol;
1209   PetscErrorCode     ierr;
1210   const PetscInt     *r_h,*ic_h;
1211   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data();
1212   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1213   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1214   PetscBool          row_identity,col_identity;
1215   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1216 
1217   PetscFunctionBegin;
1218   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1219   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
1220   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1221   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
1222   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
1223   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
1224   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1225   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1226   {
1227 #define KOKKOS_SHARED_LEVEL 1
1228     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1229     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1230     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1231     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1232     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1233     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1234     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1235     size_t flops_h = 0.0;
1236     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1237     Kokkos::View<size_t> d_flops_k ("flops");
1238     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1239     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1240     Kokkos::deep_copy (d_flops_k, h_flops_k);
1241     Kokkos::deep_copy (d_r_k, h_r_k);
1242     Kokkos::deep_copy (d_ic_k, h_ic_k);
1243     // Fill A --> fact
1244     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1245         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1246         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1247         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1248         // zero rows of B
1249         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1250             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1251             PetscScalar *baL = ba_d + bi_d[rowb];
1252             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1253             /* zero (unfactored row) */
1254             for (int j=0;j<nzbL;j++) baL[j] = 0;
1255             for (int j=0;j<nzbU;j++) baU[j] = 0;
1256           });
1257         // copy A into B
1258         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1259             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1260             const PetscScalar *av    = aa_d + ai_d[rowa];
1261             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1262             /* load in initial (unfactored row) */
1263             for (int j=0;j<nza;j++) {
1264               PetscInt    colb = ic[ajtmp[j]];
1265               PetscScalar vala = av[j];
1266               if (colb == rowb) {
1267                 *(ba_d + bdiag_d[rowb]) = vala;
1268               } else {
1269                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1270                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1271                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1272                 for (int j=0; j<nz ; j++) {
1273                   if (pbj[j] == colb) {
1274                     pba[j] = vala;
1275                     set++;
1276                     break;
1277                   }
1278                 }
1279                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1280               }
1281             }
1282           });
1283       });
1284     Kokkos::fence();
1285 
1286     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1287         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1288         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1289         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1290         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1291         const PetscInt  start = field*nloc, end = start + nloc;
1292         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1293         // A22 panel update for each row A(1,:) and col A(:,1)
1294         for (int ii=start; ii<end-1; ii++) {
1295           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1296           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1297           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1298           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1299           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1300               PetscInt kIdx = j*Ni + field_block_idx;
1301               if (kIdx >= nzUi) /* void */ ;
1302               else {
1303                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1304                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1305                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1306                 size_t         st_idx;
1307                 // find and do L(k,i) = A(:k,i) / A(i,i)
1308                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1309                 // get column, there has got to be a better way
1310                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1311                     if (pjL[j] == ii) {
1312                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1313                       idx = j; // output
1314                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1315                     }
1316                 }, st_idx);
1317                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1318 #if defined(PETSC_USE_DEBUG)
1319                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1320 #endif
1321                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1322                 // U(i+1,:end)
1323                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1324                       PetscScalar Uij = baUi[uiIdx];
1325                       PetscInt    col = bjUi[uiIdx];
1326                       if (col==myk) {
1327                         // A_kk = A_kk - L_ki * U_ij(k)
1328                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1329                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1330                       } else {
1331                         PetscScalar    *start, *end, *pAkjv=NULL;
1332                         PetscInt       high, low;
1333                         const PetscInt *startj;
1334                         if (col<myk) { // L
1335                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1336                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1337                           start = pLki+1; // start at pLki+1, A22(myk,1)
1338                           startj= bj_d + bi_d[myk] + idx;
1339                           end   = ba_d + bi_d[myk+1];
1340                         } else {
1341                           PetscInt idx = bdiag_d[myk+1]+1;
1342                           start = ba_d + idx;
1343                           startj= bj_d + idx;
1344                           end   = ba_d + bdiag_d[myk];
1345                         }
1346                         // search for 'col', use bisection search - TODO
1347                         low  = 0;
1348                         high = (PetscInt)(end-start);
1349                         while (high-low > 5) {
1350                           int t = (low+high)/2;
1351                           if (startj[t] > col) high = t;
1352                           else                 low  = t;
1353                         }
1354                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1355                           if (startj[pAkjv-start] == col) break;
1356                         }
1357 #if defined(PETSC_USE_DEBUG)
1358                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1359 #endif
1360                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1361                       }
1362                     });
1363               }
1364             });
1365           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1366           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1367         } /* endof for (i=0; i<n; i++) { */
1368         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1369       });
1370     Kokkos::fence();
1371     Kokkos::deep_copy (h_flops_k, d_flops_k);
1372     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1373     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1374         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1375         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1376         /* Invert diagonal for simpler triangular solves */
1377         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1378             int i = start + outer_index*Ni + lg_rank%Ni;
1379             if (i < end) {
1380               PetscScalar *pv = ba_d + bdiag_d[i];
1381               *pv = 1.0/(*pv);
1382             }
1383           });
1384       });
1385   }
1386   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1387   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
1388   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
1389 
1390   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1391   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
1392   if (b->inode.size) {
1393     B->ops->solve = MatSolve_SeqAIJ_Inode;
1394   } else if (row_identity && col_identity) {
1395     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1396   } else {
1397     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1398   }
1399   B->offloadmask = PETSC_OFFLOAD_GPU;
1400   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
1401   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1402   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1403   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1404   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1405   B->assembled              = PETSC_TRUE;
1406   B->preallocated           = PETSC_TRUE;
1407 
1408   PetscFunctionReturn(0);
1409 }
1410 
1411 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1412 {
1413   PetscErrorCode   ierr;
1414 
1415   PetscFunctionBegin;
1416   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1417   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1418   PetscFunctionReturn(0);
1419 }
1420 
1421 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1422 {
1423   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1424 
1425   PetscFunctionBegin;
1426   if (!factors->sptrsv_symbolic_completed) {
1427     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1428     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1429     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1430   }
1431   PetscFunctionReturn(0);
1432 }
1433 
1434 /* Check if we need to update factors etc for transpose solve */
1435 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1436 {
1437   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1438   MatColIdxType             n = A->rmap->n;
1439 
1440   PetscFunctionBegin;
1441   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1442     /* Update L^T and do sptrsv symbolic */
1443     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1444     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1445     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1446     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1447 
1448     KokkosKernels::Impl::transpose_matrix<
1449       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1450       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1451       MatRowMapKokkosView,DefaultExecutionSpace>(
1452         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1453         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1454 
1455     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1456       We have to sort the indices, until KK provides finer control options.
1457     */
1458     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1459       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1460         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1461 
1462     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1463 
1464     /* Update U^T and do sptrsv symbolic */
1465     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1466     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1467     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1468     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1469 
1470     KokkosKernels::Impl::transpose_matrix<
1471       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1472       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1473       MatRowMapKokkosView,DefaultExecutionSpace>(
1474         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1475         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1476 
1477     /* Sort indices. See comments above */
1478     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1479       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1480         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1481 
1482     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1483     factors->transpose_updated = PETSC_TRUE;
1484   }
1485   PetscFunctionReturn(0);
1486 }
1487 
1488 /* Solve Ax = b, with A = LU */
1489 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1490 {
1491   PetscErrorCode                 ierr;
1492   ConstPetscScalarKokkosView     bv;
1493   PetscScalarKokkosView          xv;
1494   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1495 
1496   PetscFunctionBegin;
1497   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1498   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
1499   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1500   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1501   /* Solve L tmpv = b */
1502   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1503   /* Solve Ux = tmpv */
1504   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1505   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1506   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1507   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1508   PetscFunctionReturn(0);
1509 }
1510 
1511 /* Solve A^T x = b, where A^T = U^T L^T */
1512 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1513 {
1514   PetscErrorCode                 ierr;
1515   ConstPetscScalarKokkosView     bv;
1516   PetscScalarKokkosView          xv;
1517   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1518 
1519   PetscFunctionBegin;
1520   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1521   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
1522   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1523   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1524   /* Solve U^T tmpv = b */
1525   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1526 
1527   /* Solve L^T x = tmpv */
1528   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1529   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1530   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1531   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1532   PetscFunctionReturn(0);
1533 }
1534 
1535 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1536 {
1537   PetscErrorCode                 ierr;
1538   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1539   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1540   PetscInt                       fill_lev = info->levels;
1541 
1542   PetscFunctionBegin;
1543   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1544   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1545 
1546   auto a_d = aijkok->a_dual.view_device();
1547   auto i_d = aijkok->i_dual.view_device();
1548   auto j_d = aijkok->j_dual.view_device();
1549 
1550   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1551 
1552   B->assembled                       = PETSC_TRUE;
1553   B->preallocated                    = PETSC_TRUE;
1554   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1555   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1556   B->ops->matsolve                   = NULL;
1557   B->ops->matsolvetranspose          = NULL;
1558   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1559 
1560   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1561   factors->transpose_updated         = PETSC_FALSE;
1562   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1563   /* TODO: log flops, but how to know that? */
1564   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1565   PetscFunctionReturn(0);
1566 }
1567 
1568 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1569 {
1570   PetscErrorCode                 ierr;
1571   Mat_SeqAIJKokkos               *aijkok;
1572   Mat_SeqAIJ                     *b;
1573   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1574   PetscInt                       fill_lev = info->levels;
1575   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1576   PetscInt                       n = A->rmap->n;
1577 
1578   PetscFunctionBegin;
1579   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1580   /* Rebuild factors */
1581   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1582   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1583 
1584   /* Create a spiluk handle and then do symbolic factorization */
1585   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1586   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1587 
1588   auto spiluk_handle = factors->kh.get_spiluk_handle();
1589 
1590   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1591   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1592   Kokkos::realloc(factors->iU_d,n+1);
1593   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1594 
1595   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1596   auto i_d = aijkok->i_dual.view_device();
1597   auto j_d = aijkok->j_dual.view_device();
1598   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1599   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1600 
1601   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1602   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1603   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1604   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1605 
1606   /* TODO: add options to select sptrsv algorithms */
1607   /* Create sptrsv handles for L, U and their transpose */
1608  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1609   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1610  #else
1611   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1612  #endif
1613 
1614   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1615   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1616   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1617   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1618 
1619   /* Fill fields of the factor matrix B */
1620   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1621   b     = (Mat_SeqAIJ*)B->data;
1622   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1623   B->info.fill_ratio_given  = info->fill;
1624   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1625 
1626   B->offloadmask            = PETSC_OFFLOAD_GPU;
1627   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1628   PetscFunctionReturn(0);
1629 }
1630 
1631 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1632 {
1633   PetscErrorCode   ierr;
1634   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1635   const PetscInt   nrows   = A->rmap->n;
1636 
1637   PetscFunctionBegin;
1638   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1639   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1640   // move B data into Kokkos
1641   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
1642   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
1643   {
1644     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1645     if (!baijkok->diag_d) {
1646       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1647       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1648       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
1649     }
1650   }
1651   PetscFunctionReturn(0);
1652 }
1653 
1654 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1655 {
1656   PetscFunctionBegin;
1657   *type = MATSOLVERKOKKOS;
1658   PetscFunctionReturn(0);
1659 }
1660 
1661 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1662 {
1663   PetscFunctionBegin;
1664   *type = MATSOLVERKOKKOSDEVICE;
1665   PetscFunctionReturn(0);
1666 }
1667 
1668 /*MC
1669   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1670   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1671 
1672   Level: beginner
1673 
1674 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1675 M*/
1676 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1677 {
1678   PetscErrorCode ierr;
1679   PetscInt       n = A->rmap->n;
1680 
1681   PetscFunctionBegin;
1682   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1683   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1684   (*B)->factortype = ftype;
1685   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1686   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1687 
1688   if (ftype == MAT_FACTOR_LU) {
1689     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1690     (*B)->canuseordering         = PETSC_TRUE;
1691     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1692   } else if (ftype == MAT_FACTOR_ILU) {
1693     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1694     (*B)->canuseordering         = PETSC_FALSE;
1695     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1696   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1697 
1698   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1699   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1700   PetscFunctionReturn(0);
1701 }
1702 
1703 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1704 {
1705   PetscErrorCode ierr;
1706   PetscInt       n = A->rmap->n;
1707 
1708   PetscFunctionBegin;
1709   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1710   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1711   (*B)->factortype = ftype;
1712   (*B)->canuseordering = PETSC_TRUE;
1713   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1714   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1715 
1716   if (ftype == MAT_FACTOR_LU) {
1717     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1718     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1719   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1720 
1721   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1722   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1723   PetscFunctionReturn(0);
1724 }
1725 
1726 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1727 {
1728   PetscErrorCode ierr;
1729 
1730   PetscFunctionBegin;
1731   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1732   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1733   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1734   PetscFunctionReturn(0);
1735 }
1736 
1737 /* Utility to print out a KokkosCsrMatrix for debugging */
1738 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1739 {
1740   PetscErrorCode    ierr;
1741   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1742   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1743   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1744   const PetscInt    *i = iv.data();
1745   const PetscInt    *j = jv.data();
1746   const PetscScalar *a = av.data();
1747   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1748 
1749   PetscFunctionBegin;
1750   ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1751   for (PetscInt k=0; k<m; k++) {
1752     ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr);
1753     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1754       ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1755     }
1756     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1757   }
1758   PetscFunctionReturn(0);
1759 }
1760