xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 9ef6dbed0bd34d94bb97f48aaa9d11d48a2bc3a9)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petscsystypes.h>
5 #include <petscerror.h>
6 
7 #include <Kokkos_Core.hpp>
8 #include <KokkosBlas.hpp>
9 #include <KokkosKernels_Sorting.hpp>
10 #include <KokkosSparse_CrsMatrix.hpp>
11 #include <KokkosSparse_spmv.hpp>
12 #include <KokkosSparse_spiluk.hpp>
13 #include <KokkosSparse_sptrsv.hpp>
14 #include <KokkosSparse_spgemm.hpp>
15 #include <KokkosSparse_spadd.hpp>
16 
17 #include <../src/mat/impls/aij/seq/aij.h>
18 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   PetscErrorCode    ierr;
29   Mat_SeqAIJ        *aijseq;
30   Mat_SeqAIJKokkos  *aijkok;
31 
32   PetscFunctionBegin;
33   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
34   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
35 
36   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38 
39   /* If aijkok does not exist, we just copy i, j to device.
40      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
41      In both cases, we build a new aijkok structure.
42   */
43   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
44     delete aijkok;
45     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
46     A->spptr = aijkok;
47   }
48 
49   if (aijkok && aijkok->device_mat_d.data()) {
50     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
51   }
52   PetscFunctionReturn(0);
53 }
54 
55 /* Sync CSR data to device if not yet */
56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
57 {
58   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
59 
60   PetscFunctionBegin;
61   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
62   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
63   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
64   if (aijkok->a_dual.need_sync_device()) {
65     aijkok->a_dual.sync_device();
66     aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */
67     aijkok->hermitian_updated = PETSC_FALSE;
68   }
69   PetscFunctionReturn(0);
70 }
71 
72 /* Mark the CSR data on device as modified */
73 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
74 {
75   PetscErrorCode   ierr;
76   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
77 
78   PetscFunctionBegin;
79   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
80   aijkok->a_dual.clear_sync_state();
81   aijkok->a_dual.modify_device();
82   aijkok->transpose_updated = PETSC_FALSE;
83   aijkok->hermitian_updated = PETSC_FALSE;
84   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
85   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
86   PetscFunctionReturn(0);
87 }
88 
89 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
90 {
91   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
92 
93   PetscFunctionBegin;
94   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
95    /* We do not expect one needs factors on host  */
96   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
97   if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
98   aijkok->a_dual.sync_host();
99   PetscFunctionReturn(0);
100 }
101 
102 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
103 {
104   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
105 
106   PetscFunctionBegin;
107   if (aijkok) {
108     aijkok->a_dual.sync_host();
109     *array = aijkok->a_dual.view_host().data();
110   } else { /* Happens when calling MatSetValues on a newly created matrix */
111     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
112   }
113   PetscFunctionReturn(0);
114 }
115 
116 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
117 {
118   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
119 
120   PetscFunctionBegin;
121   if (aijkok) aijkok->a_dual.modify_host();
122   PetscFunctionReturn(0);
123 }
124 
125 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
126 {
127   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
128 
129   PetscFunctionBegin;
130   if (aijkok) {
131     aijkok->a_dual.sync_host();
132     *array = aijkok->a_dual.view_host().data();
133   } else {
134     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
135   }
136   PetscFunctionReturn(0);
137 }
138 
139 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
140 {
141   PetscFunctionBegin;
142   *array = NULL;
143   PetscFunctionReturn(0);
144 }
145 
146 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
147 {
148   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
149 
150   PetscFunctionBegin;
151   if (aijkok) {
152     *array = aijkok->a_dual.view_host().data();
153   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
154     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
155   }
156   PetscFunctionReturn(0);
157 }
158 
159 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
160 {
161   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
162 
163   PetscFunctionBegin;
164   if (aijkok) {
165     aijkok->a_dual.clear_sync_state();
166     aijkok->a_dual.modify_host();
167   }
168   PetscFunctionReturn(0);
169 }
170 
171 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
172 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
173 {
174   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
175   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
176 
177   PetscFunctionBegin;
178   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
179   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
180   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
181   PetscFunctionReturn(0);
182 }
183 
184 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
185 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
186 {
187   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
188 
189   PetscFunctionBegin;
190   if (aijkok && aijkok->device_mat_d.data()) {
191     *d_mat = aijkok->device_mat_d.data();
192   } else {
193     PetscErrorCode   ierr;
194     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
195     *d_mat  = NULL;
196   }
197   PetscFunctionReturn(0);
198 }
199 
200 /* Generate the transpose on device and cache it internally */
201 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
202 {
203   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
204 
205   PetscFunctionBegin;
206   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
208     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
209     CHKERRCXX(aijkok->a_dual.sync_device());
210     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
211     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
212     aijkok->transpose_updated = PETSC_TRUE;
213   }
214   *csrmatT = &aijkok->csrmatT;
215   PetscFunctionReturn(0);
216 }
217 
218 /* Generate the Hermitian on device and cache it internally */
219 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
220 {
221   PetscErrorCode                   ierr;
222   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
223 
224   PetscFunctionBegin;
225   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
226   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
227   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
228     CHKERRCXX(aijkok->a_dual.sync_device());
229     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
230     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
231    #if defined(PETSC_USE_COMPLEX)
232     const auto& a = aijkok->csrmatH.values;
233     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
234    #endif
235     aijkok->hermitian_updated = PETSC_TRUE;
236   }
237   *csrmatH = &aijkok->csrmatH;
238   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
239   PetscFunctionReturn(0);
240 }
241 
242 /* y = A x */
243 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
244 {
245   PetscErrorCode                   ierr;
246   Mat_SeqAIJKokkos                 *aijkok;
247   ConstPetscScalarKokkosView       xv;
248   PetscScalarKokkosView            yv;
249 
250   PetscFunctionBegin;
251   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
252   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
253   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
254   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
255   ierr   = PetscLogGpuTimeBegin();CHKERRQ(ierr);
256   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
257   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
258   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
259   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
260   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
261   PetscFunctionReturn(0);
262 }
263 
264 /* y = A^T x */
265 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
266 {
267   PetscErrorCode                   ierr;
268   Mat_SeqAIJKokkos                 *aijkok;
269   const char                       *mode;
270   ConstPetscScalarKokkosView       xv;
271   PetscScalarKokkosView            yv;
272   KokkosCsrMatrix                  *csrmat;
273 
274   PetscFunctionBegin;
275   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
276   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
277   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
278   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
279   if (A->form_explicit_transpose) {
280     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
281     mode = "N";
282   } else {
283     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
284     csrmat = &aijkok->csrmat;
285     mode = "T";
286   }
287   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
288   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
289   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
290   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
291   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
292   PetscFunctionReturn(0);
293 }
294 
295 /* y = A^H x */
296 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
297 {
298   PetscErrorCode                   ierr;
299   Mat_SeqAIJKokkos                 *aijkok;
300   const char                       *mode;
301   ConstPetscScalarKokkosView       xv;
302   PetscScalarKokkosView            yv;
303   KokkosCsrMatrix                  *csrmat;
304 
305   PetscFunctionBegin;
306   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
307   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
308   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
309   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
310   if (A->form_explicit_transpose) {
311     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
312     mode = "N";
313   } else {
314     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
315     csrmat = &aijkok->csrmat;
316     mode = "C";
317   }
318   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
319   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
320   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
321   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
322   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
323   PetscFunctionReturn(0);
324 }
325 
326 /* z = A x + y */
327 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
328 {
329   PetscErrorCode                   ierr;
330   Mat_SeqAIJKokkos                 *aijkok;
331   ConstPetscScalarKokkosView       xv,yv;
332   PetscScalarKokkosView            zv;
333 
334   PetscFunctionBegin;
335   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
336   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
337   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
338   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
339   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
340   if (zz != yy) Kokkos::deep_copy(zv,yv);
341   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
342   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
343   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
344   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
345   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
346   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
347   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
348   PetscFunctionReturn(0);
349 }
350 
351 /* z = A^T x + y */
352 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
353 {
354   PetscErrorCode                   ierr;
355   Mat_SeqAIJKokkos                 *aijkok;
356   const char                       *mode;
357   ConstPetscScalarKokkosView       xv,yv;
358   PetscScalarKokkosView            zv;
359   KokkosCsrMatrix                  *csrmat;
360 
361   PetscFunctionBegin;
362   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
363   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
364   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
365   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
366   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
367   if (zz != yy) Kokkos::deep_copy(zv,yv);
368   if (A->form_explicit_transpose) {
369     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
370     mode = "N";
371   } else {
372     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
373     csrmat = &aijkok->csrmat;
374     mode = "T";
375   }
376   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
377   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
378   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
379   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
380   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
381   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
382   PetscFunctionReturn(0);
383 }
384 
385 /* z = A^H x + y */
386 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
387 {
388   PetscErrorCode                   ierr;
389   Mat_SeqAIJKokkos                 *aijkok;
390   const char                       *mode;
391   ConstPetscScalarKokkosView       xv,yv;
392   PetscScalarKokkosView            zv;
393   KokkosCsrMatrix                  *csrmat;
394 
395   PetscFunctionBegin;
396   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
397   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
398   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
399   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
400   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
401   if (zz != yy) Kokkos::deep_copy(zv,yv);
402   if (A->form_explicit_transpose) {
403     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
404     mode = "N";
405   } else {
406     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
407     csrmat = &aijkok->csrmat;
408     mode = "C";
409   }
410   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
411   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
412   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
413   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
414   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
415   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
416   PetscFunctionReturn(0);
417 }
418 
419 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
420 {
421   PetscErrorCode            ierr;
422   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
423 
424   PetscFunctionBegin;
425   switch (op) {
426     case MAT_FORM_EXPLICIT_TRANSPOSE:
427       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
428       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
429       A->form_explicit_transpose = flg;
430       break;
431     default:
432       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
433       break;
434   }
435   PetscFunctionReturn(0);
436 }
437 
438 /* Depending on reuse, either build a new mat, or use the existing mat */
439 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
440 {
441   PetscErrorCode   ierr;
442   Mat_SeqAIJ       *aseq;
443 
444   PetscFunctionBegin;
445   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
446   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
447     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
448   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
449     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
450   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
451     if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
452     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
453     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
454     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
455     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
456     aseq = static_cast<Mat_SeqAIJ*>(A->data);
457     if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */
458       if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
459       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
460     }
461   }
462   PetscFunctionReturn(0);
463 }
464 
465 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
466    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
467  */
468 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
469 {
470   PetscErrorCode        ierr;
471   Mat_SeqAIJ            *bseq;
472   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
473   Mat                   mat;
474 
475   PetscFunctionBegin;
476   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
477   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
478   mat  = *B;
479   if (A->assembled) {
480     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
481     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
482     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
483     /* Now copy values to B if needed */
484     if (dupOption == MAT_COPY_VALUES) {
485       if (akok->a_dual.need_sync_device()) {
486         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
487         bkok->a_dual.modify_host();
488       } else { /* If device has the latest data, we only copy data on device */
489         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
490         bkok->a_dual.modify_device();
491       }
492     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
493       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
494       bkok->a_dual.modify_host();
495     }
496     mat->spptr = bkok;
497   }
498 
499   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
500   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
501   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
502   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
503   PetscFunctionReturn(0);
504 }
505 
506 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
507 {
508   PetscErrorCode    ierr;
509   Mat               At;
510   KokkosCsrMatrix   *internT,*csrmatT;
511   Mat_SeqAIJKokkos  *atkok,*bkok;
512 
513   PetscFunctionBegin;
514   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
515   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
516     CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */
517     CHKERRCXX(atkok   = new Mat_SeqAIJKokkos(*csrmatT));
518     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
519     if (reuse == MAT_INITIAL_MATRIX) *B = At;
520     else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
521   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
522     if ((*B)->assembled) {
523       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
524       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
525       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
526     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
527       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
528       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
529       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
530       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
531       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
532     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
533   }
534   PetscFunctionReturn(0);
535 }
536 
537 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
538 {
539   PetscErrorCode             ierr;
540   Mat_SeqAIJKokkos           *aijkok;
541 
542   PetscFunctionBegin;
543   if (A->factortype == MAT_FACTOR_NONE) {
544     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
545     if (aijkok) {
546       if (aijkok->device_mat_d.data()) {
547         delete aijkok->colmap_d;
548         delete aijkok->i_uncompressed_d;
549       }
550       if (aijkok->diag_d) delete aijkok->diag_d;
551     }
552     delete aijkok;
553   } else {
554     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
555   }
556   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
557   A->spptr = NULL;
558   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
559   PetscFunctionReturn(0);
560 }
561 
562 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
563 {
564   PetscErrorCode ierr;
565 
566   PetscFunctionBegin;
567   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
568   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
569   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
570   PetscFunctionReturn(0);
571 }
572 
573 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
574 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
575 {
576   PetscErrorCode               ierr;
577   Mat_SeqAIJ                   *a,*b;
578   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
579   MatScalarKokkosView          aa,ba,ca;
580   MatRowMapKokkosView          ai,bi,ci;
581   MatColIdxKokkosView          aj,bj,cj;
582   PetscInt                     m,n,nnz,aN;
583 
584   PetscFunctionBegin;
585   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
586   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
587   PetscValidPointer(C,4);
588   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
589   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
590   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
591   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
592 
593   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
594   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
595   a    = static_cast<Mat_SeqAIJ*>(A->data);
596   b    = static_cast<Mat_SeqAIJ*>(B->data);
597   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
598   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
599   aa   = akok->a_dual.view_device();
600   ai   = akok->i_dual.view_device();
601   ba   = bkok->a_dual.view_device();
602   bi   = bkok->i_dual.view_device();
603   m    = A->rmap->n; /* M, N and nnz of C */
604   n    = A->cmap->n + B->cmap->n;
605   nnz  = a->nz + b->nz;
606   aN   = A->cmap->n; /* N of A */
607   if (reuse == MAT_INITIAL_MATRIX) {
608     aj = akok->j_dual.view_device();
609     bj = bkok->j_dual.view_device();
610     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
611     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
612     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
613     ca = ca_dual.view_device();
614     ci = ci_dual.view_device();
615     cj = cj_dual.view_device();
616 
617     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
618     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
619       PetscInt i = t.league_rank(); /* row i */
620       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
621 
622       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
623         ci(i) = coffset;
624         if (i == m-1) ci(m) = ai(m) + bi(m);
625       });
626 
627       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
628         if (k < alen) {
629           ca(coffset+k) = aa(ai(i)+k);
630           cj(coffset+k) = aj(ai(i)+k);
631         } else {
632           ca(coffset+k) = ba(bi(i)+k-alen);
633           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
634         }
635       });
636     });
637     ca_dual.modify_device();
638     ci_dual.modify_device();
639     cj_dual.modify_device();
640     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
641     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
642   } else if (reuse == MAT_REUSE_MATRIX) {
643     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
644     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
645     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
646     ca   = ckok->a_dual.view_device();
647     ci   = ckok->i_dual.view_device();
648 
649     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
650       PetscInt i = t.league_rank(); /* row i */
651       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
652       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
653         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
654         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
655       });
656     });
657     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
658   }
659   PetscFunctionReturn(0);
660 }
661 
662 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
663 {
664   PetscFunctionBegin;
665   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
666   PetscFunctionReturn(0);
667 }
668 
669 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
670 {
671   PetscErrorCode                 ierr;
672   Mat_Product                    *product = C->product;
673   Mat                            A,B;
674   bool                           transA,transB; /* use bool, since KK needs this type */
675   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
676   Mat_SeqAIJ                     *c;
677   MatProductData_SeqAIJKokkos    *pdata;
678   KokkosCsrMatrix                *csrmatA,*csrmatB;
679 
680   PetscFunctionBegin;
681   MatCheckProduct(C,1);
682   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
683   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
684 
685   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
686     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
687     PetscFunctionReturn(0);
688   }
689 
690   switch (product->type) {
691     case MATPRODUCT_AB:  transA = false; transB = false; break;
692     case MATPRODUCT_AtB: transA = true;  transB = false; break;
693     case MATPRODUCT_ABt: transA = false; transB = true;  break;
694     default:
695       SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
696   }
697 
698   A     = product->A;
699   B     = product->B;
700   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
701   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
702   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
703   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
704   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
705 
706   if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
707 
708   csrmatA = &akok->csrmat;
709   csrmatB = &bkok->csrmat;
710 
711   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
712   if (transA) {
713     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
714     transA = false;
715   }
716 
717   if (transB) {
718     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
719     transB = false;
720   }
721   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
722   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
723   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
724   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
725   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
726   /* shorter version of MatAssemblyEnd_SeqAIJ */
727   c = (Mat_SeqAIJ*)C->data;
728   ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
729   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
730   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
731   c->reallocs         = 0;
732   C->info.mallocs     = 0;
733   C->info.nz_unneeded = 0;
734   C->assembled        = C->was_assembled = PETSC_TRUE;
735   C->num_ass++;
736   PetscFunctionReturn(0);
737 }
738 
739 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
740 {
741   PetscErrorCode                 ierr;
742   Mat_Product                    *product = C->product;
743   MatProductType                 ptype;
744   Mat                            A,B;
745   bool                           transA,transB;
746   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
747   MatProductData_SeqAIJKokkos    *pdata;
748   MPI_Comm                       comm;
749   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
750 
751   PetscFunctionBegin;
752   MatCheckProduct(C,1);
753   ierr = PetscObjectGetComm((PetscObject)C,&comm);
754   if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty");
755   A       = product->A;
756   B       = product->B;
757   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
758   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
759   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
760   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
761   csrmatA = &akok->csrmat;
762   csrmatB = &bkok->csrmat;
763 
764   ptype   = product->type;
765   switch (ptype) {
766     case MATPRODUCT_AB:  transA = false; transB = false; break;
767     case MATPRODUCT_AtB: transA = true;  transB = false; break;
768     case MATPRODUCT_ABt: transA = false; transB = true;  break;
769     default:
770       SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
771   }
772 
773   product->data = pdata = new MatProductData_SeqAIJKokkos();
774   pdata->kh.set_team_work_size(16);
775   pdata->kh.set_dynamic_scheduling(true);
776   pdata->reusesym = product->api_user;
777 
778   /* TODO: add command line options to select spgemm algorithms */
779   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
780   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
781      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
782    */
783   pdata->kh.create_spgemm_handle(spgemm_alg);
784 
785   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
786   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
787   if (transA) {
788     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
789     transA = false;
790   }
791 
792   if (transB) {
793     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
794     transB = false;
795   }
796 
797   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
798   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
799     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
800     calling new Mat_SeqAIJKokkos().
801     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
802   */
803   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
804   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
805   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
806 
807   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
808   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
809   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
810   PetscFunctionReturn(0);
811 }
812 
813 /* handles sparse matrix matrix ops */
814 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
815 {
816   PetscErrorCode ierr;
817   Mat_Product    *product = mat->product;
818   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
819 
820   PetscFunctionBegin;
821   MatCheckProduct(mat,1);
822   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
823   if (product->type == MATPRODUCT_ABC) {
824     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
825   }
826   if (Biskok && Ciskok) {
827     switch (product->type) {
828     case MATPRODUCT_AB:
829     case MATPRODUCT_AtB:
830     case MATPRODUCT_ABt:
831       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
832       break;
833     case MATPRODUCT_PtAP:
834     case MATPRODUCT_RARt:
835     case MATPRODUCT_ABC:
836       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
837       break;
838     default:
839       break;
840     }
841   } else { /* fallback for AIJ */
842     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
843   }
844   PetscFunctionReturn(0);
845 }
846 
847 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
848 {
849   PetscErrorCode   ierr;
850   Mat_SeqAIJKokkos *aijkok;
851 
852   PetscFunctionBegin;
853   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
854   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
855   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
856   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
857   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
858   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
859   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
860   PetscFunctionReturn(0);
861 }
862 
863 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
864 {
865   PetscErrorCode   ierr;
866   Mat_SeqAIJKokkos *aijkok;
867 
868   PetscFunctionBegin;
869   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
870   if (aijkok) { /* Only zero the device if data is already there */
871     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
872     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
873   } else { /* Might be preallocated but not assembled */
874     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
875   }
876   PetscFunctionReturn(0);
877 }
878 
879 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
880 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
881 {
882   PetscErrorCode     ierr;
883   Mat_SeqAIJKokkos   *aijkok;
884 
885   PetscFunctionBegin;
886   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
887   PetscValidPointer(kv,2);
888   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
889   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
890   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
891   *kv    = aijkok->a_dual.view_device();
892   PetscFunctionReturn(0);
893 }
894 
895 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
896 {
897   PetscFunctionBegin;
898   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
899   PetscValidPointer(kv,2);
900   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
901   PetscFunctionReturn(0);
902 }
903 
904 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv)
905 {
906   PetscErrorCode     ierr;
907   Mat_SeqAIJKokkos   *aijkok;
908 
909   PetscFunctionBegin;
910   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
911   PetscValidPointer(kv,2);
912   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
913   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
914   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
915   *kv    = aijkok->a_dual.view_device();
916   PetscFunctionReturn(0);
917 }
918 
919 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv)
920 {
921   PetscErrorCode     ierr;
922 
923   PetscFunctionBegin;
924   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
925   PetscValidPointer(kv,2);
926   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
927   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
928   PetscFunctionReturn(0);
929 }
930 
931 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
932 {
933   Mat_SeqAIJKokkos   *aijkok;
934 
935   PetscFunctionBegin;
936   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
937   PetscValidPointer(kv,2);
938   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
939   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
940   *kv    = aijkok->a_dual.view_device();
941   PetscFunctionReturn(0);
942 }
943 
944 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
945 {
946   PetscErrorCode     ierr;
947 
948   PetscFunctionBegin;
949   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
950   PetscValidPointer(kv,2);
951   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
952   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
953   PetscFunctionReturn(0);
954 }
955 
956 /* Computes Y += alpha X */
957 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
958 {
959   PetscErrorCode             ierr;
960   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
961   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
962   ConstMatScalarKokkosView   Xa;
963   MatScalarKokkosView        Ya;
964 
965   PetscFunctionBegin;
966   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
967   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
968   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
969   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
970   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
971 
972   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
973     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
974     PetscBool e;
975     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
976     if (e) {
977       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
978       if (e) pattern = SAME_NONZERO_PATTERN;
979     }
980   }
981 
982   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
983     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
984     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
985     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
986   */
987   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
988   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
989   Xa   = xkok->a_dual.view_device();
990   Ya   = ykok->a_dual.view_device();
991 
992   if (pattern == SAME_NONZERO_PATTERN) {
993     KokkosBlas::axpy(alpha,Xa,Ya);
994     ierr = MatSeqAIJKokkosModifyDevice(Y);
995   } else if (pattern == SUBSET_NONZERO_PATTERN) {
996     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
997     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
998 
999     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
1000       PetscInt i = t.league_rank(); /* row i */
1001       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
1002         PetscInt p,q = Yi(i);
1003         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1004           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1005           if (Xj(p) == Yj(q)) { /* Found it */
1006             Ya(q) += alpha * Xa(p);
1007             q++;
1008           } else {
1009             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1010                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1011             */
1012             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1013           }
1014         }
1015       });
1016     });
1017     ierr = MatSeqAIJKokkosModifyDevice(Y);
1018   } else { /* different nonzero patterns */
1019     Mat             Z;
1020     KokkosCsrMatrix zcsr;
1021     KernelHandle    kh;
1022     kh.create_spadd_handle(false);
1023     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1024     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1025     zkok = new Mat_SeqAIJKokkos(zcsr);
1026     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1027     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1028     kh.destroy_spadd_handle();
1029   }
1030   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1031   ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */
1032   PetscFunctionReturn(0);
1033 }
1034 
1035 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1036 {
1037   PetscErrorCode ierr;
1038 
1039   PetscFunctionBegin;
1040   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
1041   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
1042   B->offloadmask = PETSC_OFFLOAD_CPU;
1043   PetscFunctionReturn(0);
1044 }
1045 
1046 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1047 {
1048   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1049 
1050   PetscFunctionBegin;
1051   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1052   A->boundtocpu  = PETSC_FALSE;
1053 
1054   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1055   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1056   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1057   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1058   A->ops->scale                     = MatScale_SeqAIJKokkos;
1059   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1060   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1061   A->ops->mult                      = MatMult_SeqAIJKokkos;
1062   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1063   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1064   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1065   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1066   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1067   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1068   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1069   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1070   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1071   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1072   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1073   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1074   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1075   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1076   PetscFunctionReturn(0);
1077 }
1078 
1079 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1080 {
1081   PetscErrorCode ierr;
1082   Mat_SeqAIJ     *aseq;
1083   PetscInt       i,m,n;
1084 
1085   PetscFunctionBegin;
1086   if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1087 
1088   m    = akok->nrows();
1089   n    = akok->ncols();
1090   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1091   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1092 
1093   /* Set up data structures of A as a MATSEQAIJ */
1094   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1095   aseq = (Mat_SeqAIJ*)(A)->data;
1096 
1097   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1098   akok->j_dual.sync_host();
1099 
1100   aseq->i            = akok->i_host_data();
1101   aseq->j            = akok->j_host_data();
1102   aseq->a            = akok->a_host_data();
1103   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1104   aseq->singlemalloc = PETSC_FALSE;
1105   aseq->free_a       = PETSC_FALSE;
1106   aseq->free_ij      = PETSC_FALSE;
1107   aseq->nz           = akok->nnz();
1108   aseq->maxnz        = aseq->nz;
1109 
1110   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1111   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1112   for (i=0; i<m; i++) {
1113     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1114   }
1115 
1116   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1117   akok->nonzerostate = A->nonzerostate;
1118   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1119   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1120   A->spptr = akok;
1121   PetscFunctionReturn(0);
1122 }
1123 
1124 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1125 
1126    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1127  */
1128 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1129 {
1130   PetscErrorCode ierr;
1131 
1132   PetscFunctionBegin;
1133   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1134   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
1135   PetscFunctionReturn(0);
1136 }
1137 
1138 /* --------------------------------------------------------------------------------*/
1139 /*@C
1140    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1141    (the default parallel PETSc format). This matrix will ultimately be handled by
1142    Kokkos for calculations. For good matrix
1143    assembly performance the user should preallocate the matrix storage by setting
1144    the parameter nz (or the array nnz).  By setting these parameters accurately,
1145    performance during matrix assembly can be increased by more than a factor of 50.
1146 
1147    Collective
1148 
1149    Input Parameters:
1150 +  comm - MPI communicator, set to PETSC_COMM_SELF
1151 .  m - number of rows
1152 .  n - number of columns
1153 .  nz - number of nonzeros per row (same for all rows)
1154 -  nnz - array containing the number of nonzeros in the various rows
1155          (possibly different for each row) or NULL
1156 
1157    Output Parameter:
1158 .  A - the matrix
1159 
1160    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1161    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1162    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1163 
1164    Notes:
1165    If nnz is given then nz is ignored
1166 
1167    The AIJ format (also called the Yale sparse matrix format or
1168    compressed row storage), is fully compatible with standard Fortran 77
1169    storage.  That is, the stored row and column indices can begin at
1170    either one (as in Fortran) or zero.  See the users' manual for details.
1171 
1172    Specify the preallocated storage with either nz or nnz (not both).
1173    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1174    allocation.  For large problems you MUST preallocate memory or you
1175    will get TERRIBLE performance, see the users' manual chapter on matrices.
1176 
1177    By default, this format uses inodes (identical nodes) when possible, to
1178    improve numerical efficiency of matrix-vector products and solves. We
1179    search for consecutive rows with the same nonzero structure, thereby
1180    reusing matrix information to achieve increased efficiency.
1181 
1182    Level: intermediate
1183 
1184 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1185 @*/
1186 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1187 {
1188   PetscErrorCode ierr;
1189 
1190   PetscFunctionBegin;
1191   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1192   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1193   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1194   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1195   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1196   PetscFunctionReturn(0);
1197 }
1198 
1199 typedef Kokkos::TeamPolicy<>::member_type team_member;
1200 //
1201 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
1202 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1203 //
1204 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1205 {
1206   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1207   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1208   IS                 isrow = b->row,isicol = b->icol;
1209   PetscErrorCode     ierr;
1210   const PetscInt     *r_h,*ic_h;
1211   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data();
1212   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1213   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1214   PetscBool          row_identity,col_identity;
1215   PetscInt           nc, Nf, nVec=32; // should be a parameter
1216   PetscContainer     container;
1217 
1218   PetscFunctionBegin;
1219   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1220   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
1221   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1222   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
1223   if (container) {
1224     PetscInt *pNf=NULL, nv;
1225     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
1226     Nf = (*pNf)%1000;
1227     nv = (*pNf)/1000;
1228     if (nv>0) nVec = nv;
1229   } else Nf = 1;
1230   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %" PetscInt_FMT " %" PetscInt_FMT,n,Nf);
1231   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
1232   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
1233   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
1234   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1235   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1236   {
1237 #define KOKKOS_SHARED_LEVEL 1
1238     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1239     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1240     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1241     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1242     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1243     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1244     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1245     size_t flops_h = 0.0;
1246     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1247     Kokkos::View<size_t> d_flops_k ("flops");
1248     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1249     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1250     Kokkos::deep_copy (d_flops_k, h_flops_k);
1251     Kokkos::deep_copy (d_r_k, h_r_k);
1252     Kokkos::deep_copy (d_ic_k, h_ic_k);
1253     // Fill A --> fact
1254     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1255         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1256         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1257         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1258         // zero rows of B
1259         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1260             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1261             PetscScalar *baL = ba_d + bi_d[rowb];
1262             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1263             /* zero (unfactored row) */
1264             for (int j=0;j<nzbL;j++) baL[j] = 0;
1265             for (int j=0;j<nzbU;j++) baU[j] = 0;
1266           });
1267         // copy A into B
1268         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1269             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1270             const PetscScalar *av    = aa_d + ai_d[rowa];
1271             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1272             /* load in initial (unfactored row) */
1273             for (int j=0;j<nza;j++) {
1274               PetscInt    colb = ic[ajtmp[j]];
1275               PetscScalar vala = av[j];
1276               if (colb == rowb) {
1277                 *(ba_d + bdiag_d[rowb]) = vala;
1278               } else {
1279                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1280                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1281                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1282                 for (int j=0; j<nz ; j++) {
1283                   if (pbj[j] == colb) {
1284                     pba[j] = vala;
1285                     set++;
1286                     break;
1287                   }
1288                 }
1289                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1290               }
1291             }
1292           });
1293       });
1294     Kokkos::fence();
1295 
1296     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1297         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1298         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1299         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1300         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1301         const PetscInt  start = field*nloc, end = start + nloc;
1302         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1303         // A22 panel update for each row A(1,:) and col A(:,1)
1304         for (int ii=start; ii<end-1; ii++) {
1305           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1306           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1307           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1308           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1309           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1310               PetscInt kIdx = j*Ni + field_block_idx;
1311               if (kIdx >= nzUi) /* void */ ;
1312               else {
1313                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1314                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1315                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1316                 size_t         st_idx;
1317                 // find and do L(k,i) = A(:k,i) / A(i,i)
1318                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1319                 // get column, there has got to be a better way
1320                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1321                     if (pjL[j] == ii) {
1322                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1323                       idx = j; // output
1324                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1325                     }
1326                 }, st_idx);
1327                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1328 #if defined(PETSC_USE_DEBUG)
1329                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1330 #endif
1331                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1332                 // U(i+1,:end)
1333                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1334                       PetscScalar Uij = baUi[uiIdx];
1335                       PetscInt    col = bjUi[uiIdx];
1336                       if (col==myk) {
1337                         // A_kk = A_kk - L_ki * U_ij(k)
1338                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1339                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1340                       } else {
1341                         PetscScalar    *start, *end, *pAkjv=NULL;
1342                         PetscInt       high, low;
1343                         const PetscInt *startj;
1344                         if (col<myk) { // L
1345                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1346                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1347                           start = pLki+1; // start at pLki+1, A22(myk,1)
1348                           startj= bj_d + bi_d[myk] + idx;
1349                           end   = ba_d + bi_d[myk+1];
1350                         } else {
1351                           PetscInt idx = bdiag_d[myk+1]+1;
1352                           start = ba_d + idx;
1353                           startj= bj_d + idx;
1354                           end   = ba_d + bdiag_d[myk];
1355                         }
1356                         // search for 'col', use bisection search - TODO
1357                         low  = 0;
1358                         high = (PetscInt)(end-start);
1359                         while (high-low > 5) {
1360                           int t = (low+high)/2;
1361                           if (startj[t] > col) high = t;
1362                           else                 low  = t;
1363                         }
1364                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1365                           if (startj[pAkjv-start] == col) break;
1366                         }
1367 #if defined(PETSC_USE_DEBUG)
1368                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1369 #endif
1370                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1371                       }
1372                     });
1373               }
1374             });
1375           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1376           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1377         } /* endof for (i=0; i<n; i++) { */
1378         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1379       });
1380     Kokkos::fence();
1381     Kokkos::deep_copy (h_flops_k, d_flops_k);
1382     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1383     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1384         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1385         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1386         /* Invert diagonal for simpler triangular solves */
1387         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1388             int i = start + outer_index*Ni + lg_rank%Ni;
1389             if (i < end) {
1390               PetscScalar *pv = ba_d + bdiag_d[i];
1391               *pv = 1.0/(*pv);
1392             }
1393           });
1394       });
1395   }
1396   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1397   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
1398   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
1399 
1400   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1401   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
1402   if (b->inode.size) {
1403     B->ops->solve = MatSolve_SeqAIJ_Inode;
1404   } else if (row_identity && col_identity) {
1405     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1406   } else {
1407     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1408   }
1409   B->offloadmask = PETSC_OFFLOAD_GPU;
1410   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
1411   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1412   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1413   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1414   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1415   B->assembled              = PETSC_TRUE;
1416   B->preallocated           = PETSC_TRUE;
1417 
1418   PetscFunctionReturn(0);
1419 }
1420 
1421 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1422 {
1423   PetscErrorCode   ierr;
1424 
1425   PetscFunctionBegin;
1426   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1427   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1428   PetscFunctionReturn(0);
1429 }
1430 
1431 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1432 {
1433   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1434 
1435   PetscFunctionBegin;
1436   if (!factors->sptrsv_symbolic_completed) {
1437     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1438     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1439     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1440   }
1441   PetscFunctionReturn(0);
1442 }
1443 
1444 /* Check if we need to update factors etc for transpose solve */
1445 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1446 {
1447   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1448   MatColIdxType             n = A->rmap->n;
1449 
1450   PetscFunctionBegin;
1451   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1452     /* Update L^T and do sptrsv symbolic */
1453     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1454     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1455     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1456     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1457 
1458     KokkosKernels::Impl::transpose_matrix<
1459       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1460       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1461       MatRowMapKokkosView,DefaultExecutionSpace>(
1462         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1463         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1464 
1465     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1466       We have to sort the indices, until KK provides finer control options.
1467     */
1468     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1469       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1470         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1471 
1472     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1473 
1474     /* Update U^T and do sptrsv symbolic */
1475     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1476     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1477     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1478     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1479 
1480     KokkosKernels::Impl::transpose_matrix<
1481       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1482       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1483       MatRowMapKokkosView,DefaultExecutionSpace>(
1484         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1485         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1486 
1487     /* Sort indices. See comments above */
1488     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1489       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1490         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1491 
1492     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1493     factors->transpose_updated = PETSC_TRUE;
1494   }
1495   PetscFunctionReturn(0);
1496 }
1497 
1498 /* Solve Ax = b, with A = LU */
1499 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1500 {
1501   PetscErrorCode                 ierr;
1502   ConstPetscScalarKokkosView     bv;
1503   PetscScalarKokkosView          xv;
1504   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1505 
1506   PetscFunctionBegin;
1507   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1508   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
1509   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1510   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1511   /* Solve L tmpv = b */
1512   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1513   /* Solve Ux = tmpv */
1514   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1515   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1516   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1517   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1518   PetscFunctionReturn(0);
1519 }
1520 
1521 /* Solve A^T x = b, where A^T = U^T L^T */
1522 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1523 {
1524   PetscErrorCode                 ierr;
1525   ConstPetscScalarKokkosView     bv;
1526   PetscScalarKokkosView          xv;
1527   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1528 
1529   PetscFunctionBegin;
1530   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1531   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
1532   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1533   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1534   /* Solve U^T tmpv = b */
1535   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1536 
1537   /* Solve L^T x = tmpv */
1538   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1539   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1540   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1541   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1542   PetscFunctionReturn(0);
1543 }
1544 
1545 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1546 {
1547   PetscErrorCode                 ierr;
1548   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1549   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1550   PetscInt                       fill_lev = info->levels;
1551 
1552   PetscFunctionBegin;
1553   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1554   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1555 
1556   auto a_d = aijkok->a_dual.view_device();
1557   auto i_d = aijkok->i_dual.view_device();
1558   auto j_d = aijkok->j_dual.view_device();
1559 
1560   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1561 
1562   B->assembled                       = PETSC_TRUE;
1563   B->preallocated                    = PETSC_TRUE;
1564   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1565   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1566   B->ops->matsolve                   = NULL;
1567   B->ops->matsolvetranspose          = NULL;
1568   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1569 
1570   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1571   factors->transpose_updated         = PETSC_FALSE;
1572   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1573   /* TODO: log flops, but how to know that? */
1574   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1575   PetscFunctionReturn(0);
1576 }
1577 
1578 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1579 {
1580   PetscErrorCode                 ierr;
1581   Mat_SeqAIJKokkos               *aijkok;
1582   Mat_SeqAIJ                     *b;
1583   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1584   PetscInt                       fill_lev = info->levels;
1585   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1586   PetscInt                       n = A->rmap->n;
1587 
1588   PetscFunctionBegin;
1589   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1590   /* Rebuild factors */
1591   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1592   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1593 
1594   /* Create a spiluk handle and then do symbolic factorization */
1595   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1596   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1597 
1598   auto spiluk_handle = factors->kh.get_spiluk_handle();
1599 
1600   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1601   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1602   Kokkos::realloc(factors->iU_d,n+1);
1603   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1604 
1605   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1606   auto i_d = aijkok->i_dual.view_device();
1607   auto j_d = aijkok->j_dual.view_device();
1608   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1609   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1610 
1611   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1612   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1613   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1614   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1615 
1616   /* TODO: add options to select sptrsv algorithms */
1617   /* Create sptrsv handles for L, U and their transpose */
1618  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1619   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1620  #else
1621   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1622  #endif
1623 
1624   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1625   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1626   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1627   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1628 
1629   /* Fill fields of the factor matrix B */
1630   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1631   b     = (Mat_SeqAIJ*)B->data;
1632   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1633   B->info.fill_ratio_given  = info->fill;
1634   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1635 
1636   B->offloadmask            = PETSC_OFFLOAD_GPU;
1637   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1638   PetscFunctionReturn(0);
1639 }
1640 
1641 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1642 {
1643   PetscErrorCode   ierr;
1644   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1645   const PetscInt   nrows   = A->rmap->n;
1646 
1647   PetscFunctionBegin;
1648   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1649   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1650   // move B data into Kokkos
1651   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
1652   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
1653   {
1654     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1655     if (!baijkok->diag_d) {
1656       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1657       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1658       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
1659     }
1660   }
1661   PetscFunctionReturn(0);
1662 }
1663 
1664 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1665 {
1666   PetscFunctionBegin;
1667   *type = MATSOLVERKOKKOS;
1668   PetscFunctionReturn(0);
1669 }
1670 
1671 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1672 {
1673   PetscFunctionBegin;
1674   *type = MATSOLVERKOKKOSDEVICE;
1675   PetscFunctionReturn(0);
1676 }
1677 
1678 /*MC
1679   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1680   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1681 
1682   Level: beginner
1683 
1684 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1685 M*/
1686 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1687 {
1688   PetscErrorCode ierr;
1689   PetscInt       n = A->rmap->n;
1690 
1691   PetscFunctionBegin;
1692   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1693   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1694   (*B)->factortype = ftype;
1695   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1696   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1697 
1698   if (ftype == MAT_FACTOR_LU) {
1699     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1700     (*B)->canuseordering         = PETSC_TRUE;
1701     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1702   } else if (ftype == MAT_FACTOR_ILU) {
1703     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1704     (*B)->canuseordering         = PETSC_FALSE;
1705     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1706   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1707 
1708   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1709   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1710   PetscFunctionReturn(0);
1711 }
1712 
1713 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1714 {
1715   PetscErrorCode ierr;
1716   PetscInt       n = A->rmap->n;
1717 
1718   PetscFunctionBegin;
1719   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1720   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1721   (*B)->factortype = ftype;
1722   (*B)->canuseordering = PETSC_TRUE;
1723   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1724   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1725 
1726   if (ftype == MAT_FACTOR_LU) {
1727     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1728     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1729   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1730 
1731   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1732   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1733   PetscFunctionReturn(0);
1734 }
1735 
1736 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1737 {
1738   PetscErrorCode ierr;
1739 
1740   PetscFunctionBegin;
1741   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1742   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1743   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1744   PetscFunctionReturn(0);
1745 }
1746 
1747 /* Utility to print out a KokkosCsrMatrix for debugging */
1748 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1749 {
1750   PetscErrorCode    ierr;
1751   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1752   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1753   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1754   const PetscInt    *i = iv.data();
1755   const PetscInt    *j = jv.data();
1756   const PetscScalar *a = av.data();
1757   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1758 
1759   PetscFunctionBegin;
1760   ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1761   for (PetscInt k=0; k<m; k++) {
1762     ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr);
1763     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1764       ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1765     }
1766     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1767   }
1768   PetscFunctionReturn(0);
1769 }
1770