xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision e831869d00a6b6bf5bd64f1ce9662fca56500707)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petscsystypes.h>
5 #include <petscerror.h>
6 
7 #include <Kokkos_Core.hpp>
8 #include <KokkosBlas.hpp>
9 #include <KokkosKernels_Sorting.hpp>
10 #include <KokkosSparse_CrsMatrix.hpp>
11 #include <KokkosSparse_spmv.hpp>
12 #include <KokkosSparse_spiluk.hpp>
13 #include <KokkosSparse_sptrsv.hpp>
14 #include <KokkosSparse_spgemm.hpp>
15 #include <KokkosSparse_spadd.hpp>
16 
17 #include <../src/mat/impls/aij/seq/aij.h>
18 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   PetscErrorCode    ierr;
29   Mat_SeqAIJ        *aijseq;
30   Mat_SeqAIJKokkos  *aijkok;
31 
32   PetscFunctionBegin;
33   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
34   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
35 
36   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38 
39   /* If aijkok does not exist, we just copy i, j to device.
40      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
41      In both cases, we build a new aijkok structure.
42   */
43   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
44     delete aijkok;
45     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
46     A->spptr = aijkok;
47   }
48 
49   if (aijkok && aijkok->device_mat_d.data()) {
50     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
51   }
52   PetscFunctionReturn(0);
53 }
54 
55 /* Sync CSR data to device if not yet */
56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
57 {
58   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
59 
60   PetscFunctionBegin;
61   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
62   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
63   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
64   if (aijkok->a_dual.need_sync_device()) {
65     aijkok->a_dual.sync_device();
66     aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */
67     aijkok->hermitian_updated = PETSC_FALSE;
68   }
69   PetscFunctionReturn(0);
70 }
71 
72 /* Mark the CSR data on device as modified */
73 static PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
74 {
75   PetscErrorCode   ierr;
76   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
77 
78   PetscFunctionBegin;
79   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
80   aijkok->a_dual.clear_sync_state();
81   aijkok->a_dual.modify_device();
82   aijkok->transpose_updated = PETSC_FALSE;
83   aijkok->hermitian_updated = PETSC_FALSE;
84   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
85   PetscFunctionReturn(0);
86 }
87 
88 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
89 {
90   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
91 
92   PetscFunctionBegin;
93   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
94    /* We do not expect one needs factors on host  */
95   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
96   if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
97   aijkok->a_dual.sync_host();
98   PetscFunctionReturn(0);
99 }
100 
101 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
102 {
103   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
104 
105   PetscFunctionBegin;
106   if (aijkok) {
107     aijkok->a_dual.sync_host();
108     *array = aijkok->a_dual.view_host().data();
109   } else { /* Happens when calling MatSetValues on a newly created matrix */
110     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
111   }
112   PetscFunctionReturn(0);
113 }
114 
115 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
116 {
117   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
118 
119   PetscFunctionBegin;
120   if (aijkok) aijkok->a_dual.modify_host();
121   PetscFunctionReturn(0);
122 }
123 
124 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
125 {
126   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
127 
128   PetscFunctionBegin;
129   aijkok->a_dual.sync_host();
130   *array = aijkok->a_dual.view_host().data();
131   PetscFunctionReturn(0);
132 }
133 
134 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
135 {
136   PetscFunctionBegin;
137   *array = NULL;
138   PetscFunctionReturn(0);
139 }
140 
141 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
142 {
143   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
144 
145   PetscFunctionBegin;
146   *array = aijkok->a_dual.view_host().data();
147   PetscFunctionReturn(0);
148 }
149 
150 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
151 {
152   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
153 
154   PetscFunctionBegin;
155   aijkok->a_dual.clear_sync_state();
156   aijkok->a_dual.modify_host();
157   PetscFunctionReturn(0);
158 }
159 
160 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
161 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
162 {
163   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
164   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
165 
166   PetscFunctionBegin;
167   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
168   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
169   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
170   PetscFunctionReturn(0);
171 }
172 
173 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
174 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
175 {
176   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
177 
178   PetscFunctionBegin;
179   if (aijkok && aijkok->device_mat_d.data()) {
180     *d_mat = aijkok->device_mat_d.data();
181   } else {
182     PetscErrorCode   ierr;
183     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
184     *d_mat  = NULL;
185   }
186   PetscFunctionReturn(0);
187 }
188 
189 /* Generate the transpose on device and cache it internally */
190 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
191 {
192   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
193 
194   PetscFunctionBegin;
195   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
196   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
197     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
198     CHKERRCXX(aijkok->a_dual.sync_device());
199     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
200     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
201     aijkok->transpose_updated = PETSC_TRUE;
202   }
203   *csrmatT = &aijkok->csrmatT;
204   PetscFunctionReturn(0);
205 }
206 
207 /* Generate the Hermitian on device and cache it internally */
208 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
209 {
210   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
211 
212   PetscFunctionBegin;
213   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
214   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
215     CHKERRCXX(aijkok->a_dual.sync_device());
216     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
217     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
218    #if defined(PETSC_USE_COMPLEX)
219     const auto& a = aijkok->csrmatH.values;
220     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
221    #endif
222     aijkok->hermitian_updated = PETSC_TRUE;
223   }
224   *csrmatH = &aijkok->csrmatH;
225   PetscFunctionReturn(0);
226 }
227 
228 /* y = A x */
229 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
230 {
231   PetscErrorCode                   ierr;
232   Mat_SeqAIJKokkos                 *aijkok;
233   ConstPetscScalarKokkosView       xv;
234   PetscScalarKokkosView            yv;
235 
236   PetscFunctionBegin;
237   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
238   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
239   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
240   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
241   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
242   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
243   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
244   ierr   = WaitForKokkos();CHKERRQ(ierr);
245   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
246   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
247   PetscFunctionReturn(0);
248 }
249 
250 /* y = A^T x */
251 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
252 {
253   PetscErrorCode                   ierr;
254   Mat_SeqAIJKokkos                 *aijkok;
255   const char                       *mode;
256   ConstPetscScalarKokkosView       xv;
257   PetscScalarKokkosView            yv;
258   KokkosCsrMatrix                  *csrmat;
259 
260   PetscFunctionBegin;
261   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
262   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
263   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
264   if (A->form_explicit_transpose) {
265     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
266     mode = "N";
267   } else {
268     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
269     csrmat = &aijkok->csrmat;
270     mode = "T";
271   }
272   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
273   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
274   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
275   ierr = WaitForKokkos();CHKERRQ(ierr);
276   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
277   PetscFunctionReturn(0);
278 }
279 
280 /* y = A^H x */
281 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
282 {
283   PetscErrorCode                   ierr;
284   Mat_SeqAIJKokkos                 *aijkok;
285   const char                       *mode;
286   ConstPetscScalarKokkosView       xv;
287   PetscScalarKokkosView            yv;
288   KokkosCsrMatrix                  *csrmat;
289 
290   PetscFunctionBegin;
291   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
292   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
293   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
294   if (A->form_explicit_transpose) {
295     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
296     mode = "N";
297   } else {
298     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
299     csrmat = &aijkok->csrmat;
300     mode = "C";
301   }
302   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
303   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
304   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
305   ierr = WaitForKokkos();CHKERRQ(ierr);
306   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
307   PetscFunctionReturn(0);
308 }
309 
310 /* z = A x + y */
311 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
312 {
313   PetscErrorCode                   ierr;
314   Mat_SeqAIJKokkos                 *aijkok;
315   ConstPetscScalarKokkosView       xv,yv;
316   PetscScalarKokkosView            zv;
317 
318   PetscFunctionBegin;
319   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
320   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
321   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
322   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
323   if (zz != yy) Kokkos::deep_copy(zv,yv);
324   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
325   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
326   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
327   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
328   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
329   ierr = WaitForKokkos();CHKERRQ(ierr);
330   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
331   PetscFunctionReturn(0);
332 }
333 
334 /* z = A^T x + y */
335 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
336 {
337   PetscErrorCode                   ierr;
338   Mat_SeqAIJKokkos                 *aijkok;
339   const char                       *mode;
340   ConstPetscScalarKokkosView       xv,yv;
341   PetscScalarKokkosView            zv;
342   KokkosCsrMatrix                  *csrmat;
343 
344   PetscFunctionBegin;
345   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
346   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
347   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
348   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
349   if (zz != yy) Kokkos::deep_copy(zv,yv);
350   if (A->form_explicit_transpose) {
351     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
352     mode = "N";
353   } else {
354     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
355     csrmat = &aijkok->csrmat;
356     mode = "T";
357   }
358   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
359   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
360   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
361   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
362   ierr = WaitForKokkos();CHKERRQ(ierr);
363   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
364   PetscFunctionReturn(0);
365 }
366 
367 /* z = A^H x + y */
368 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
369 {
370   PetscErrorCode                   ierr;
371   Mat_SeqAIJKokkos                 *aijkok;
372   const char                       *mode;
373   ConstPetscScalarKokkosView       xv,yv;
374   PetscScalarKokkosView            zv;
375   KokkosCsrMatrix                  *csrmat;
376 
377   PetscFunctionBegin;
378   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
379   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
380   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
381   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
382   if (zz != yy) Kokkos::deep_copy(zv,yv);
383   if (A->form_explicit_transpose) {
384     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
385     mode = "N";
386   } else {
387     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
388     csrmat = &aijkok->csrmat;
389     mode = "C";
390   }
391   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
392   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
393   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
394   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
395   ierr = WaitForKokkos();CHKERRQ(ierr);
396   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
397   PetscFunctionReturn(0);
398 }
399 
400 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
401 {
402   PetscErrorCode            ierr;
403   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
404 
405   PetscFunctionBegin;
406   switch (op) {
407     case MAT_FORM_EXPLICIT_TRANSPOSE:
408       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
409       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
410       A->form_explicit_transpose = flg;
411       break;
412     default:
413       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
414       break;
415   }
416   PetscFunctionReturn(0);
417 }
418 
419 /* Depending on reuse, either build a new mat, or use the existing mat */
420 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
421 {
422   PetscErrorCode   ierr;
423   Mat_SeqAIJ       *aseq;
424 
425   PetscFunctionBegin;
426   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
427   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
428     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
429   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
430     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
431   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
432     if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
433     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
434     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
435     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
436     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
437     aseq = static_cast<Mat_SeqAIJ*>(A->data);
438     if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */
439       if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
440       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
441     }
442   }
443   PetscFunctionReturn(0);
444 }
445 
446 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
447    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
448  */
449 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
450 {
451   PetscErrorCode        ierr;
452   Mat_SeqAIJ            *bseq;
453   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
454   Mat                   mat;
455 
456   PetscFunctionBegin;
457   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
458   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
459   mat  = *B;
460   if (A->assembled) {
461     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
462     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
463     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
464     /* Now copy values to B if needed */
465     if (dupOption == MAT_COPY_VALUES) {
466       if (akok->a_dual.need_sync_device()) {
467         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
468         bkok->a_dual.modify_host();
469       } else { /* If device has the latest data, we only copy data on device */
470         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
471         bkok->a_dual.modify_device();
472       }
473     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
474       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
475       bkok->a_dual.modify_host();
476     }
477     mat->spptr = bkok;
478   }
479 
480   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
481   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
482   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
483   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
484   PetscFunctionReturn(0);
485 }
486 
487 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
488 {
489   PetscErrorCode    ierr;
490   Mat               At;
491   KokkosCsrMatrix   *internT,*csrmatT;
492   Mat_SeqAIJKokkos  *atkok,*bkok;
493 
494   PetscFunctionBegin;
495   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
496   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
497     CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */
498     CHKERRCXX(atkok   = new Mat_SeqAIJKokkos(*csrmatT));
499     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
500     if (reuse == MAT_INITIAL_MATRIX) *B = At;
501     else {ierr = MatHeaderMerge(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
502   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
503     if ((*B)->assembled) {
504       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
505       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
506       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
507     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
508       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
509       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
510       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
511       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
512       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
513     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
514   }
515   PetscFunctionReturn(0);
516 }
517 
518 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
519 {
520   PetscErrorCode             ierr;
521   Mat_SeqAIJKokkos           *aijkok;
522 
523   PetscFunctionBegin;
524   if (A->factortype == MAT_FACTOR_NONE) {
525     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
526     if (aijkok) {
527       if (aijkok->device_mat_d.data()) {
528         delete aijkok->colmap_d;
529         delete aijkok->i_uncompressed_d;
530       }
531       if (aijkok->diag_d) delete aijkok->diag_d;
532     }
533     delete aijkok;
534   } else {
535     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
536   }
537   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
538   A->spptr = NULL;
539   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
540   PetscFunctionReturn(0);
541 }
542 
543 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
544 {
545   PetscErrorCode ierr;
546 
547   PetscFunctionBegin;
548   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
549   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
550   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
551   PetscFunctionReturn(0);
552 }
553 
554 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
555 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
556 {
557   PetscErrorCode               ierr;
558   Mat_SeqAIJ                   *a,*b;
559   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
560   MatScalarKokkosView          aa,ba,ca;
561   MatRowMapKokkosView          ai,bi,ci;
562   MatColIdxKokkosView          aj,bj,cj;
563   PetscInt                     m,n,nnz,aN;
564 
565   PetscFunctionBegin;
566   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
567   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
568   PetscValidPointer(C,4);
569   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
570   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
571   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
572   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
573 
574   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
575   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
576   a    = static_cast<Mat_SeqAIJ*>(A->data);
577   b    = static_cast<Mat_SeqAIJ*>(B->data);
578   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
579   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
580   aa   = akok->a_dual.view_device();
581   ai   = akok->i_dual.view_device();
582   ba   = bkok->a_dual.view_device();
583   bi   = bkok->i_dual.view_device();
584   m    = A->rmap->n; /* M, N and nnz of C */
585   n    = A->cmap->n + B->cmap->n;
586   nnz  = a->nz + b->nz;
587   aN   = A->cmap->n; /* N of A */
588   if (reuse == MAT_INITIAL_MATRIX) {
589     aj = akok->j_dual.view_device();
590     bj = bkok->j_dual.view_device();
591     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
592     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
593     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
594     ca = ca_dual.view_device();
595     ci = ci_dual.view_device();
596     cj = cj_dual.view_device();
597 
598     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
599     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
600       PetscInt i = t.league_rank(); /* row i */
601       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
602 
603       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
604         ci(i) = coffset;
605         if (i == m-1) ci(m) = ai(m) + bi(m);
606       });
607 
608       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
609         if (k < alen) {
610           ca(coffset+k) = aa(ai(i)+k);
611           cj(coffset+k) = aj(ai(i)+k);
612         } else {
613           ca(coffset+k) = ba(bi(i)+k-alen);
614           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
615         }
616       });
617     });
618     ca_dual.modify_device();
619     ci_dual.modify_device();
620     cj_dual.modify_device();
621     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
622     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
623   } else if (reuse == MAT_REUSE_MATRIX) {
624     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
625     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
626     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
627     ca   = ckok->a_dual.view_device();
628     ci   = ckok->i_dual.view_device();
629 
630     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
631       PetscInt i = t.league_rank(); /* row i */
632       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
633       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
634         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
635         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
636       });
637     });
638     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
639   }
640   PetscFunctionReturn(0);
641 }
642 
643 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
644 {
645   PetscFunctionBegin;
646   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
647   PetscFunctionReturn(0);
648 }
649 
650 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
651 {
652   PetscErrorCode                 ierr;
653   Mat_Product                    *product = C->product;
654   Mat                            A,B;
655   bool                           transA,transB; /* use bool, since KK needs this type */
656   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
657   Mat_SeqAIJ                     *c;
658   MatProductData_SeqAIJKokkos    *pdata;
659   KokkosCsrMatrix                *csrmatA,*csrmatB;
660 
661   PetscFunctionBegin;
662   MatCheckProduct(C,1);
663   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
664   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
665 
666   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
667     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
668     PetscFunctionReturn(0);
669   }
670 
671   switch (product->type) {
672     case MATPRODUCT_AB:  transA = false; transB = false; break;
673     case MATPRODUCT_AtB: transA = true;  transB = false; break;
674     case MATPRODUCT_ABt: transA = false; transB = true;  break;
675     default:
676       SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
677   }
678 
679   A     = product->A;
680   B     = product->B;
681   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
682   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
683   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
684   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
685   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
686 
687   if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
688 
689   csrmatA = &akok->csrmat;
690   csrmatB = &bkok->csrmat;
691 
692   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
693   if (transA) {
694     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
695     transA = false;
696   }
697 
698   if (transB) {
699     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
700     transB = false;
701   }
702 
703   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
704   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
705   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
706   /* shorter version of MatAssemblyEnd_SeqAIJ */
707   c = (Mat_SeqAIJ*)C->data;
708   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
709   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
710   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
711   c->reallocs         = 0;
712   C->info.mallocs     = 0;
713   C->info.nz_unneeded = 0;
714   C->assembled        = C->was_assembled = PETSC_TRUE;
715   C->num_ass++;
716   PetscFunctionReturn(0);
717 }
718 
719 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
720 {
721   PetscErrorCode                 ierr;
722   Mat_Product                    *product = C->product;
723   MatProductType                 ptype;
724   Mat                            A,B;
725   bool                           transA,transB;
726   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
727   MatProductData_SeqAIJKokkos    *pdata;
728   MPI_Comm                       comm;
729   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
730 
731   PetscFunctionBegin;
732   MatCheckProduct(C,1);
733   ierr = PetscObjectGetComm((PetscObject)C,&comm);
734   if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty");
735   A       = product->A;
736   B       = product->B;
737   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
738   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
739   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
740   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
741   csrmatA = &akok->csrmat;
742   csrmatB = &bkok->csrmat;
743 
744   ptype   = product->type;
745   switch (ptype) {
746     case MATPRODUCT_AB:  transA = false; transB = false; break;
747     case MATPRODUCT_AtB: transA = true;  transB = false; break;
748     case MATPRODUCT_ABt: transA = false; transB = true;  break;
749     default:
750       SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
751   }
752 
753   product->data = pdata = new MatProductData_SeqAIJKokkos();
754   pdata->kh.set_team_work_size(16);
755   pdata->kh.set_dynamic_scheduling(true);
756   pdata->reusesym = product->api_user;
757 
758   /* TODO: add command line options to select spgemm algorithms */
759   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
760 #if defined(PETSC_HAVE_CUDA)
761   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
762     /* This algorithm + cuda-10.2 sometimes gave wrong results (invalid device pointers in csrmatC) and failed snes/tutorials/ex56.c */
763     spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_CUSPARSE;
764   #endif
765 #endif
766   pdata->kh.create_spgemm_handle(spgemm_alg);
767 
768   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
769   if (transA) {
770     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
771     transA = false;
772   }
773 
774   if (transB) {
775     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
776     transB = false;
777   }
778 
779   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
780   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
781     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
782     calling new Mat_SeqAIJKokkos().
783     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
784   */
785   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
786   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
787 
788   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
789   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
790   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
791   PetscFunctionReturn(0);
792 }
793 
794 /* handles sparse matrix matrix ops */
795 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
796 {
797   PetscErrorCode ierr;
798   Mat_Product    *product = mat->product;
799   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
800 
801   PetscFunctionBegin;
802   MatCheckProduct(mat,1);
803   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
804   if (product->type == MATPRODUCT_ABC) {
805     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
806   }
807   if (Biskok && Ciskok) {
808     switch (product->type) {
809     case MATPRODUCT_AB:
810     case MATPRODUCT_AtB:
811     case MATPRODUCT_ABt:
812       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
813       break;
814     case MATPRODUCT_PtAP:
815     case MATPRODUCT_RARt:
816     case MATPRODUCT_ABC:
817       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
818       break;
819     default:
820       break;
821     }
822   } else { /* fallback for AIJ */
823     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
824   }
825   PetscFunctionReturn(0);
826 }
827 
828 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
829 {
830   PetscErrorCode   ierr;
831   Mat_SeqAIJKokkos *aijkok;
832 
833   PetscFunctionBegin;
834   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
835   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
836   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
837   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
838   ierr = WaitForKokkos();CHKERRQ(ierr);
839   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
840   PetscFunctionReturn(0);
841 }
842 
843 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
844 {
845   PetscErrorCode   ierr;
846   Mat_SeqAIJKokkos *aijkok;
847 
848   PetscFunctionBegin;
849   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
850   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
851   KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
852   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); /* Cached diagonal values are invalided */
853   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
854   PetscFunctionReturn(0);
855 }
856 
857 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
858 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
859 {
860   PetscErrorCode     ierr;
861   Mat_SeqAIJKokkos   *aijkok;
862 
863   PetscFunctionBegin;
864   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
865   PetscValidPointer(kv,2);
866   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
867   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
868   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
869   *kv    = aijkok->a_dual.view_device();
870   PetscFunctionReturn(0);
871 }
872 
873 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
874 {
875   PetscFunctionBegin;
876   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
877   PetscValidPointer(kv,2);
878   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
879   PetscFunctionReturn(0);
880 }
881 
882 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv)
883 {
884   PetscErrorCode     ierr;
885   Mat_SeqAIJKokkos   *aijkok;
886 
887   PetscFunctionBegin;
888   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
889   PetscValidPointer(kv,2);
890   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
891   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
892   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
893   *kv    = aijkok->a_dual.view_device();
894   PetscFunctionReturn(0);
895 }
896 
897 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv)
898 {
899   PetscErrorCode     ierr;
900 
901   PetscFunctionBegin;
902   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
903   PetscValidPointer(kv,2);
904   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
905   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
906   PetscFunctionReturn(0);
907 }
908 
909 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
910 {
911   Mat_SeqAIJKokkos   *aijkok;
912 
913   PetscFunctionBegin;
914   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
915   PetscValidPointer(kv,2);
916   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
917   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
918   *kv    = aijkok->a_dual.view_device();
919   PetscFunctionReturn(0);
920 }
921 
922 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
923 {
924   PetscErrorCode     ierr;
925 
926   PetscFunctionBegin;
927   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
928   PetscValidPointer(kv,2);
929   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
930   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
931   PetscFunctionReturn(0);
932 }
933 
934 /* Computes Y += alpha X */
935 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
936 {
937   PetscErrorCode             ierr;
938   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
939   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
940   ConstMatScalarKokkosView   Xa;
941   MatScalarKokkosView        Ya;
942 
943   PetscFunctionBegin;
944   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
945   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
946   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
947   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
948 
949   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
950     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
951     PetscBool e;
952     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
953     if (e) {
954       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
955       if (e) pattern = SAME_NONZERO_PATTERN;
956     }
957   }
958 
959   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
960     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
961     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
962     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
963   */
964   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
965   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
966   Xa   = xkok->a_dual.view_device();
967   Ya   = ykok->a_dual.view_device();
968 
969   if (pattern == SAME_NONZERO_PATTERN) {
970     KokkosBlas::axpy(alpha,Xa,Ya);
971     ierr = MatSeqAIJKokkosModifyDevice(Y);
972   } else if (pattern == SUBSET_NONZERO_PATTERN) {
973     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
974     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
975 
976     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
977       PetscInt i = t.league_rank(); /* row i */
978       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
979         PetscInt p,q = Yi(i);
980         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
981           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
982           if (Xj(p) == Yj(q)) { /* Found it */
983             Ya(q) += alpha * Xa(p);
984             q++;
985           } else {
986             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
987                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
988             */
989             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
990           }
991         }
992       });
993     });
994     ierr = MatSeqAIJKokkosModifyDevice(Y);
995   } else { /* different nonzero patterns */
996     Mat             Z;
997     KokkosCsrMatrix zcsr;
998     KernelHandle    kh;
999     kh.create_spadd_handle(false);
1000     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1001     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1002     zkok = new Mat_SeqAIJKokkos(zcsr);
1003     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1004     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1005     kh.destroy_spadd_handle();
1006   }
1007 
1008   PetscFunctionReturn(0);
1009 }
1010 
1011 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1012 {
1013   PetscErrorCode ierr;
1014 
1015   PetscFunctionBegin;
1016   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
1017   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
1018   B->offloadmask = PETSC_OFFLOAD_CPU;
1019   PetscFunctionReturn(0);
1020 }
1021 
1022 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1023 {
1024   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1025 
1026   PetscFunctionBegin;
1027   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1028   A->boundtocpu  = PETSC_FALSE;
1029 
1030   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1031   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1032   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1033   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1034   A->ops->scale                     = MatScale_SeqAIJKokkos;
1035   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1036   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1037   A->ops->mult                      = MatMult_SeqAIJKokkos;
1038   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1039   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1040   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1041   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1042   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1043   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1044   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1045   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1046   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1047   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1048   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1049   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1050   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1051   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1052   PetscFunctionReturn(0);
1053 }
1054 
1055 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1056 {
1057   PetscErrorCode     ierr;
1058   Mat_SeqAIJ         *aseq;
1059   PetscInt           i,m,n;
1060 
1061   PetscFunctionBegin;
1062   if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1063 
1064   m    = akok->nrows();
1065   n    = akok->ncols();
1066   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1067   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1068 
1069   /* Set up data structures of A as a MATSEQAIJ */
1070   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1071   aseq = (Mat_SeqAIJ*)(A)->data;
1072 
1073   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1074   akok->j_dual.sync_host();
1075 
1076   aseq->i            = akok->i_host_data();
1077   aseq->j            = akok->j_host_data();
1078   aseq->a            = akok->a_host_data();
1079   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1080   aseq->singlemalloc = PETSC_FALSE;
1081   aseq->free_a       = PETSC_FALSE;
1082   aseq->free_ij      = PETSC_FALSE;
1083   aseq->nz           = akok->nnz();
1084   aseq->maxnz        = aseq->nz;
1085 
1086   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1087   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1088   for (i=0; i<m; i++) {
1089     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1090   }
1091 
1092   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1093   akok->nonzerostate = A->nonzerostate;
1094   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1095   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1096   A->spptr = akok;
1097   PetscFunctionReturn(0);
1098 }
1099 
1100 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1101 
1102    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1103  */
1104 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1105 {
1106   PetscErrorCode     ierr;
1107 
1108   PetscFunctionBegin;
1109   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1110   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
1111   PetscFunctionReturn(0);
1112 }
1113 
1114 /* --------------------------------------------------------------------------------*/
1115 /*@C
1116    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1117    (the default parallel PETSc format). This matrix will ultimately be handled by
1118    Kokkos for calculations. For good matrix
1119    assembly performance the user should preallocate the matrix storage by setting
1120    the parameter nz (or the array nnz).  By setting these parameters accurately,
1121    performance during matrix assembly can be increased by more than a factor of 50.
1122 
1123    Collective
1124 
1125    Input Parameters:
1126 +  comm - MPI communicator, set to PETSC_COMM_SELF
1127 .  m - number of rows
1128 .  n - number of columns
1129 .  nz - number of nonzeros per row (same for all rows)
1130 -  nnz - array containing the number of nonzeros in the various rows
1131          (possibly different for each row) or NULL
1132 
1133    Output Parameter:
1134 .  A - the matrix
1135 
1136    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1137    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1138    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1139 
1140    Notes:
1141    If nnz is given then nz is ignored
1142 
1143    The AIJ format (also called the Yale sparse matrix format or
1144    compressed row storage), is fully compatible with standard Fortran 77
1145    storage.  That is, the stored row and column indices can begin at
1146    either one (as in Fortran) or zero.  See the users' manual for details.
1147 
1148    Specify the preallocated storage with either nz or nnz (not both).
1149    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1150    allocation.  For large problems you MUST preallocate memory or you
1151    will get TERRIBLE performance, see the users' manual chapter on matrices.
1152 
1153    By default, this format uses inodes (identical nodes) when possible, to
1154    improve numerical efficiency of matrix-vector products and solves. We
1155    search for consecutive rows with the same nonzero structure, thereby
1156    reusing matrix information to achieve increased efficiency.
1157 
1158    Level: intermediate
1159 
1160 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1161 @*/
1162 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1163 {
1164   PetscErrorCode ierr;
1165 
1166   PetscFunctionBegin;
1167   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1168   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1169   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1170   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1171   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1172   PetscFunctionReturn(0);
1173 }
1174 
1175 typedef Kokkos::TeamPolicy<>::member_type team_member;
1176 //
1177 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
1178 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1179 //
1180 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1181 {
1182   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1183   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1184   IS                 isrow = b->row,isicol = b->icol;
1185   PetscErrorCode     ierr;
1186   const PetscInt     *r_h,*ic_h;
1187   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data();
1188   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1189   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1190   PetscBool          row_identity,col_identity;
1191   PetscInt           nc, Nf, nVec=32; // should be a parameter
1192   PetscContainer     container;
1193 
1194   PetscFunctionBegin;
1195   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n);
1196   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
1197   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1198   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
1199   if (container) {
1200     PetscInt *pNf=NULL, nv;
1201     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
1202     Nf = (*pNf)%1000;
1203     nv = (*pNf)/1000;
1204     if (nv>0) nVec = nv;
1205   } else Nf = 1;
1206   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
1207   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
1208   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
1209   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
1210 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1211   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1212 #endif
1213   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1214   {
1215 #define KOKKOS_SHARED_LEVEL 1
1216     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1217     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1218     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1219     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1220     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1221     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1222     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1223     size_t flops_h = 0.0;
1224     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1225     Kokkos::View<size_t> d_flops_k ("flops");
1226     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1227     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1228     Kokkos::deep_copy (d_flops_k, h_flops_k);
1229     Kokkos::deep_copy (d_r_k, h_r_k);
1230     Kokkos::deep_copy (d_ic_k, h_ic_k);
1231     // Fill A --> fact
1232     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1233         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1234         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1235         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1236         // zero rows of B
1237         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1238             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1239             PetscScalar *baL = ba_d + bi_d[rowb];
1240             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1241             /* zero (unfactored row) */
1242             for (int j=0;j<nzbL;j++) baL[j] = 0;
1243             for (int j=0;j<nzbU;j++) baU[j] = 0;
1244           });
1245         // copy A into B
1246         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1247             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1248             const PetscScalar *av    = aa_d + ai_d[rowa];
1249             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1250             /* load in initial (unfactored row) */
1251             for (int j=0;j<nza;j++) {
1252               PetscInt    colb = ic[ajtmp[j]];
1253               PetscScalar vala = av[j];
1254               if (colb == rowb) {
1255                 *(ba_d + bdiag_d[rowb]) = vala;
1256               } else {
1257                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1258                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1259                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1260                 for (int j=0; j<nz ; j++) {
1261                   if (pbj[j] == colb) {
1262                     pba[j] = vala;
1263                     set++;
1264                     break;
1265                   }
1266                 }
1267                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1268               }
1269             }
1270           });
1271       });
1272     Kokkos::fence();
1273 
1274     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1275         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1276         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1277         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1278         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1279         const PetscInt  start = field*nloc, end = start + nloc;
1280         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1281         // A22 panel update for each row A(1,:) and col A(:,1)
1282         for (int ii=start; ii<end-1; ii++) {
1283           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1284           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1285           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1286           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1287           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1288               PetscInt kIdx = j*Ni + field_block_idx;
1289               if (kIdx >= nzUi) /* void */ ;
1290               else {
1291                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1292                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1293                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1294                 size_t         st_idx;
1295                 // find and do L(k,i) = A(:k,i) / A(i,i)
1296                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1297                 // get column, there has got to be a better way
1298                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1299                     if (pjL[j] == ii) {
1300                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1301                       idx = j; // output
1302                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1303                     }
1304                 }, st_idx);
1305                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1306 #if defined(PETSC_USE_DEBUG)
1307                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1308 #endif
1309                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1310                 // U(i+1,:end)
1311                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1312                       PetscScalar Uij = baUi[uiIdx];
1313                       PetscInt    col = bjUi[uiIdx];
1314                       if (col==myk) {
1315                         // A_kk = A_kk - L_ki * U_ij(k)
1316                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1317                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1318                       } else {
1319                         PetscScalar    *start, *end, *pAkjv=NULL;
1320                         PetscInt       high, low;
1321                         const PetscInt *startj;
1322                         if (col<myk) { // L
1323                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1324                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1325                           start = pLki+1; // start at pLki+1, A22(myk,1)
1326                           startj= bj_d + bi_d[myk] + idx;
1327                           end   = ba_d + bi_d[myk+1];
1328                         } else {
1329                           PetscInt idx = bdiag_d[myk+1]+1;
1330                           start = ba_d + idx;
1331                           startj= bj_d + idx;
1332                           end   = ba_d + bdiag_d[myk];
1333                         }
1334                         // search for 'col', use bisection search - TODO
1335                         low  = 0;
1336                         high = (PetscInt)(end-start);
1337                         while (high-low > 5) {
1338                           int t = (low+high)/2;
1339                           if (startj[t] > col) high = t;
1340                           else                 low  = t;
1341                         }
1342                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1343                           if (startj[pAkjv-start] == col) break;
1344                         }
1345 #if defined(PETSC_USE_DEBUG)
1346                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1347 #endif
1348                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1349                       }
1350                     });
1351               }
1352             });
1353           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1354           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1355         } /* endof for (i=0; i<n; i++) { */
1356         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1357       });
1358     Kokkos::fence();
1359     Kokkos::deep_copy (h_flops_k, d_flops_k);
1360 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1361     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1362 #elif  defined(PETSC_USE_LOG)
1363     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1364 #endif
1365     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1366         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1367         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1368         /* Invert diagonal for simpler triangular solves */
1369         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1370             int i = start + outer_index*Ni + lg_rank%Ni;
1371             if (i < end) {
1372               PetscScalar *pv = ba_d + bdiag_d[i];
1373               *pv = 1.0/(*pv);
1374             }
1375           });
1376       });
1377   }
1378 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1379   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1380 #endif
1381   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
1382   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
1383 
1384   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1385   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
1386   if (b->inode.size) {
1387     B->ops->solve = MatSolve_SeqAIJ_Inode;
1388   } else if (row_identity && col_identity) {
1389     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1390   } else {
1391     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1392   }
1393   B->offloadmask = PETSC_OFFLOAD_GPU;
1394   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
1395   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1396   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1397   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1398   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1399   B->assembled              = PETSC_TRUE;
1400   B->preallocated           = PETSC_TRUE;
1401 
1402   PetscFunctionReturn(0);
1403 }
1404 
1405 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1406 {
1407   PetscErrorCode   ierr;
1408 
1409   PetscFunctionBegin;
1410   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1411   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1412   PetscFunctionReturn(0);
1413 }
1414 
1415 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1416 {
1417   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1418 
1419   PetscFunctionBegin;
1420   if (!factors->sptrsv_symbolic_completed) {
1421     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1422     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1423     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1424   }
1425   PetscFunctionReturn(0);
1426 }
1427 
1428 /* Check if we need to update factors etc for transpose solve */
1429 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1430 {
1431   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1432   MatColIdxType             n = A->rmap->n;
1433 
1434   PetscFunctionBegin;
1435   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1436     /* Update L^T and do sptrsv symbolic */
1437     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1438     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1439     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1440     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1441 
1442     KokkosKernels::Impl::transpose_matrix<
1443       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1444       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1445       MatRowMapKokkosView,DefaultExecutionSpace>(
1446         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1447         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1448 
1449     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1450       We have to sort the indices, until KK provides finer control options.
1451     */
1452     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1453       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1454         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1455 
1456     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1457 
1458     /* Update U^T and do sptrsv symbolic */
1459     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1460     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1461     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1462     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1463 
1464     KokkosKernels::Impl::transpose_matrix<
1465       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1466       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1467       MatRowMapKokkosView,DefaultExecutionSpace>(
1468         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1469         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1470 
1471     /* Sort indices. See comments above */
1472     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1473       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1474         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1475 
1476     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1477     factors->transpose_updated = PETSC_TRUE;
1478   }
1479   PetscFunctionReturn(0);
1480 }
1481 
1482 /* Solve Ax = b, with A = LU */
1483 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1484 {
1485   PetscErrorCode                 ierr;
1486   ConstPetscScalarKokkosView     bv;
1487   PetscScalarKokkosView          xv;
1488   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1489 
1490   PetscFunctionBegin;
1491   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
1492   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1493   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1494   /* Solve L tmpv = b */
1495   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1496   /* Solve Ux = tmpv */
1497   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1498   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1499   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1500   PetscFunctionReturn(0);
1501 }
1502 
1503 /* Solve A^T x = b, where A^T = U^T L^T */
1504 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1505 {
1506   PetscErrorCode                 ierr;
1507   ConstPetscScalarKokkosView     bv;
1508   PetscScalarKokkosView          xv;
1509   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1510 
1511   PetscFunctionBegin;
1512   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
1513   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1514   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1515   /* Solve U^T tmpv = b */
1516   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1517 
1518   /* Solve L^T x = tmpv */
1519   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1520   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1521   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1522   PetscFunctionReturn(0);
1523 }
1524 
1525 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1526 {
1527   PetscErrorCode                 ierr;
1528   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1529   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1530   PetscInt                       fill_lev = info->levels;
1531 
1532   PetscFunctionBegin;
1533   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1534 
1535   auto a_d = aijkok->a_dual.view_device();
1536   auto i_d = aijkok->i_dual.view_device();
1537   auto j_d = aijkok->j_dual.view_device();
1538 
1539   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1540 
1541   B->assembled                       = PETSC_TRUE;
1542   B->preallocated                    = PETSC_TRUE;
1543   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1544   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1545   B->ops->matsolve                   = NULL;
1546   B->ops->matsolvetranspose          = NULL;
1547   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1548 
1549   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1550   factors->transpose_updated         = PETSC_FALSE;
1551   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1552   PetscFunctionReturn(0);
1553 }
1554 
1555 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1556 {
1557   PetscErrorCode                 ierr;
1558   Mat_SeqAIJKokkos               *aijkok;
1559   Mat_SeqAIJ                     *b;
1560   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1561   PetscInt                       fill_lev = info->levels;
1562   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1563   PetscInt                       n = A->rmap->n;
1564 
1565   PetscFunctionBegin;
1566   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1567   /* Rebuild factors */
1568   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1569   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1570 
1571   /* Create a spiluk handle and then do symbolic factorization */
1572   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1573   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1574 
1575   auto spiluk_handle = factors->kh.get_spiluk_handle();
1576 
1577   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1578   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1579   Kokkos::realloc(factors->iU_d,n+1);
1580   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1581 
1582   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1583   auto i_d = aijkok->i_dual.view_device();
1584   auto j_d = aijkok->j_dual.view_device();
1585   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1586   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1587 
1588   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1589   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1590   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1591   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1592 
1593   /* TODO: add options to select sptrsv algorithms */
1594   /* Create sptrsv handles for L, U and their transpose */
1595  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1596   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1597  #else
1598   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1599  #endif
1600 
1601   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1602   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1603   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1604   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1605 
1606   /* Fill fields of the factor matrix B */
1607   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1608   b     = (Mat_SeqAIJ*)B->data;
1609   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1610   B->info.fill_ratio_given  = info->fill;
1611   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1612 
1613   B->offloadmask            = PETSC_OFFLOAD_GPU;
1614   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1615   PetscFunctionReturn(0);
1616 }
1617 
1618 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1619 {
1620   PetscErrorCode   ierr;
1621   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1622   const PetscInt   nrows   = A->rmap->n;
1623 
1624   PetscFunctionBegin;
1625   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1626   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1627   // move B data into Kokkos
1628   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
1629   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
1630   {
1631     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1632     if (!baijkok->diag_d) {
1633       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1634       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1635       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
1636     }
1637   }
1638   PetscFunctionReturn(0);
1639 }
1640 
1641 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1642 {
1643   PetscFunctionBegin;
1644   *type = MATSOLVERKOKKOS;
1645   PetscFunctionReturn(0);
1646 }
1647 
1648 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1649 {
1650   PetscFunctionBegin;
1651   *type = MATSOLVERKOKKOSDEVICE;
1652   PetscFunctionReturn(0);
1653 }
1654 
1655 /*MC
1656   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1657   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1658 
1659   Level: beginner
1660 
1661 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1662 M*/
1663 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1664 {
1665   PetscErrorCode ierr;
1666   PetscInt       n = A->rmap->n;
1667 
1668   PetscFunctionBegin;
1669   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1670   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1671   (*B)->factortype = ftype;
1672   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1673   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1674 
1675   if (ftype == MAT_FACTOR_LU) {
1676     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1677     (*B)->canuseordering         = PETSC_TRUE;
1678     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1679   } else if (ftype == MAT_FACTOR_ILU) {
1680     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1681     (*B)->canuseordering         = PETSC_FALSE;
1682     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1683   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1684 
1685   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1686   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1687   PetscFunctionReturn(0);
1688 }
1689 
1690 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1691 {
1692   PetscErrorCode ierr;
1693   PetscInt       n = A->rmap->n;
1694 
1695   PetscFunctionBegin;
1696   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1697   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1698   (*B)->factortype = ftype;
1699   (*B)->canuseordering = PETSC_TRUE;
1700   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1701   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1702 
1703   if (ftype == MAT_FACTOR_LU) {
1704     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1705     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1706   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1707 
1708   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1709   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1710   PetscFunctionReturn(0);
1711 }
1712 
1713 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1714 {
1715   PetscErrorCode ierr;
1716 
1717   PetscFunctionBegin;
1718   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1719   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1720   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1721   PetscFunctionReturn(0);
1722 }
1723 
1724 /* Utility to print out a KokkosCsrMatrix for debugging */
1725 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1726 {
1727   PetscErrorCode    ierr;
1728   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1729   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1730   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1731   const PetscInt    *i = iv.data();
1732   const PetscInt    *j = jv.data();
1733   const PetscScalar *a = av.data();
1734   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1735 
1736   PetscFunctionBegin;
1737   ierr = PetscPrintf(PETSC_COMM_SELF,"%D x %D SeqAIJKokkos, with %D nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1738   for (PetscInt k=0; k<m; k++) {
1739     ierr = PetscPrintf(PETSC_COMM_SELF,"%D: ",k);CHKERRQ(ierr);
1740     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1741       ierr = PetscPrintf(PETSC_COMM_SELF,"%D(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1742     }
1743     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1744   }
1745   PetscFunctionReturn(0);
1746 }
1747