xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision de40df40cf67d557bacf774f8a40b904b72e5e2e)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosKernels_Sorting.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 
18 #include <../src/mat/impls/aij/seq/aij.h>
19 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
20 
21 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
22 
23 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
24    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
25    In the latter case, it is important to set a_dual's sync state correctly.
26  */
27 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
28 {
29   PetscErrorCode    ierr;
30   Mat_SeqAIJ        *aijseq;
31   Mat_SeqAIJKokkos  *aijkok;
32 
33   PetscFunctionBegin;
34   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
35   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
36 
37   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
38   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
39 
40   /* If aijkok does not exist, we just copy i, j to device.
41      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
42      In both cases, we build a new aijkok structure.
43   */
44   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
45     delete aijkok;
46     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
47     A->spptr = aijkok;
48   }
49 
50   if (aijkok && aijkok->device_mat_d.data()) {
51     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
52   }
53   PetscFunctionReturn(0);
54 }
55 
56 /* Sync CSR data to device if not yet */
57 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
58 {
59   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
60 
61   PetscFunctionBegin;
62   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
63   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
64   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
65   if (aijkok->a_dual.need_sync_device()) {
66     aijkok->a_dual.sync_device();
67     aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */
68     aijkok->hermitian_updated = PETSC_FALSE;
69   }
70   PetscFunctionReturn(0);
71 }
72 
73 /* Mark the CSR data on device as modified */
74 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
75 {
76   PetscErrorCode   ierr;
77   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
78 
79   PetscFunctionBegin;
80   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
81   aijkok->a_dual.clear_sync_state();
82   aijkok->a_dual.modify_device();
83   aijkok->transpose_updated = PETSC_FALSE;
84   aijkok->hermitian_updated = PETSC_FALSE;
85   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
86   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
87   PetscFunctionReturn(0);
88 }
89 
90 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
91 {
92   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
93 
94   PetscFunctionBegin;
95   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
96    /* We do not expect one needs factors on host  */
97   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
98   if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
99   aijkok->a_dual.sync_host();
100   PetscFunctionReturn(0);
101 }
102 
103 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
104 {
105   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
106 
107   PetscFunctionBegin;
108   if (aijkok) {
109     aijkok->a_dual.sync_host();
110     *array = aijkok->a_dual.view_host().data();
111   } else { /* Happens when calling MatSetValues on a newly created matrix */
112     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
113   }
114   PetscFunctionReturn(0);
115 }
116 
117 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
118 {
119   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
120 
121   PetscFunctionBegin;
122   if (aijkok) aijkok->a_dual.modify_host();
123   PetscFunctionReturn(0);
124 }
125 
126 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
127 {
128   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
129 
130   PetscFunctionBegin;
131   if (aijkok) {
132     aijkok->a_dual.sync_host();
133     *array = aijkok->a_dual.view_host().data();
134   } else {
135     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
136   }
137   PetscFunctionReturn(0);
138 }
139 
140 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
141 {
142   PetscFunctionBegin;
143   *array = NULL;
144   PetscFunctionReturn(0);
145 }
146 
147 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
148 {
149   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
150 
151   PetscFunctionBegin;
152   if (aijkok) {
153     *array = aijkok->a_dual.view_host().data();
154   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
155     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
156   }
157   PetscFunctionReturn(0);
158 }
159 
160 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
161 {
162   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
163 
164   PetscFunctionBegin;
165   if (aijkok) {
166     aijkok->a_dual.clear_sync_state();
167     aijkok->a_dual.modify_host();
168   }
169   PetscFunctionReturn(0);
170 }
171 
172 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
173 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
174 {
175   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
176   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
177 
178   PetscFunctionBegin;
179   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
180   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
181   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
182   PetscFunctionReturn(0);
183 }
184 
185 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
186 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
187 {
188   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
189 
190   PetscFunctionBegin;
191   if (aijkok && aijkok->device_mat_d.data()) {
192     *d_mat = aijkok->device_mat_d.data();
193   } else {
194     PetscErrorCode   ierr;
195     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
196     *d_mat  = NULL;
197   }
198   PetscFunctionReturn(0);
199 }
200 
201 /* Generate the transpose on device and cache it internally */
202 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
203 {
204   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
205 
206   PetscFunctionBegin;
207   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
208   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
209     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
210     CHKERRCXX(aijkok->a_dual.sync_device());
211     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
212     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
213     aijkok->transpose_updated = PETSC_TRUE;
214   }
215   *csrmatT = &aijkok->csrmatT;
216   PetscFunctionReturn(0);
217 }
218 
219 /* Generate the Hermitian on device and cache it internally */
220 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
221 {
222   PetscErrorCode                   ierr;
223   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
224 
225   PetscFunctionBegin;
226   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
227   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
228   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
229     CHKERRCXX(aijkok->a_dual.sync_device());
230     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
231     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
232    #if defined(PETSC_USE_COMPLEX)
233     const auto& a = aijkok->csrmatH.values;
234     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
235    #endif
236     aijkok->hermitian_updated = PETSC_TRUE;
237   }
238   *csrmatH = &aijkok->csrmatH;
239   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
240   PetscFunctionReturn(0);
241 }
242 
243 /* y = A x */
244 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
245 {
246   PetscErrorCode                   ierr;
247   Mat_SeqAIJKokkos                 *aijkok;
248   ConstPetscScalarKokkosView       xv;
249   PetscScalarKokkosView            yv;
250 
251   PetscFunctionBegin;
252   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
253   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
254   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
255   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
256   ierr   = PetscLogGpuTimeBegin();CHKERRQ(ierr);
257   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
258   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
259   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
260   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
261   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
262   PetscFunctionReturn(0);
263 }
264 
265 /* y = A^T x */
266 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
267 {
268   PetscErrorCode                   ierr;
269   Mat_SeqAIJKokkos                 *aijkok;
270   const char                       *mode;
271   ConstPetscScalarKokkosView       xv;
272   PetscScalarKokkosView            yv;
273   KokkosCsrMatrix                  *csrmat;
274 
275   PetscFunctionBegin;
276   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
277   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
278   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
279   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
280   if (A->form_explicit_transpose) {
281     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
282     mode = "N";
283   } else {
284     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
285     csrmat = &aijkok->csrmat;
286     mode = "T";
287   }
288   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
289   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
290   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
291   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
292   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
293   PetscFunctionReturn(0);
294 }
295 
296 /* y = A^H x */
297 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
298 {
299   PetscErrorCode                   ierr;
300   Mat_SeqAIJKokkos                 *aijkok;
301   const char                       *mode;
302   ConstPetscScalarKokkosView       xv;
303   PetscScalarKokkosView            yv;
304   KokkosCsrMatrix                  *csrmat;
305 
306   PetscFunctionBegin;
307   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
308   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
309   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
310   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
311   if (A->form_explicit_transpose) {
312     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
313     mode = "N";
314   } else {
315     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
316     csrmat = &aijkok->csrmat;
317     mode = "C";
318   }
319   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
320   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
321   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
322   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
323   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
324   PetscFunctionReturn(0);
325 }
326 
327 /* z = A x + y */
328 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
329 {
330   PetscErrorCode                   ierr;
331   Mat_SeqAIJKokkos                 *aijkok;
332   ConstPetscScalarKokkosView       xv,yv;
333   PetscScalarKokkosView            zv;
334 
335   PetscFunctionBegin;
336   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
337   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
338   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
339   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
340   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
341   if (zz != yy) Kokkos::deep_copy(zv,yv);
342   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
343   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
344   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
345   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
346   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
347   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
348   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
349   PetscFunctionReturn(0);
350 }
351 
352 /* z = A^T x + y */
353 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
354 {
355   PetscErrorCode                   ierr;
356   Mat_SeqAIJKokkos                 *aijkok;
357   const char                       *mode;
358   ConstPetscScalarKokkosView       xv,yv;
359   PetscScalarKokkosView            zv;
360   KokkosCsrMatrix                  *csrmat;
361 
362   PetscFunctionBegin;
363   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
364   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
365   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
366   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
367   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
368   if (zz != yy) Kokkos::deep_copy(zv,yv);
369   if (A->form_explicit_transpose) {
370     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
371     mode = "N";
372   } else {
373     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
374     csrmat = &aijkok->csrmat;
375     mode = "T";
376   }
377   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
378   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
379   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
380   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
381   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
382   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
383   PetscFunctionReturn(0);
384 }
385 
386 /* z = A^H x + y */
387 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
388 {
389   PetscErrorCode                   ierr;
390   Mat_SeqAIJKokkos                 *aijkok;
391   const char                       *mode;
392   ConstPetscScalarKokkosView       xv,yv;
393   PetscScalarKokkosView            zv;
394   KokkosCsrMatrix                  *csrmat;
395 
396   PetscFunctionBegin;
397   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
398   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
399   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
400   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
401   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
402   if (zz != yy) Kokkos::deep_copy(zv,yv);
403   if (A->form_explicit_transpose) {
404     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
405     mode = "N";
406   } else {
407     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
408     csrmat = &aijkok->csrmat;
409     mode = "C";
410   }
411   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
412   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
413   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
414   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
415   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
416   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
417   PetscFunctionReturn(0);
418 }
419 
420 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
421 {
422   PetscErrorCode            ierr;
423   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
424 
425   PetscFunctionBegin;
426   switch (op) {
427     case MAT_FORM_EXPLICIT_TRANSPOSE:
428       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
429       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
430       A->form_explicit_transpose = flg;
431       break;
432     default:
433       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
434       break;
435   }
436   PetscFunctionReturn(0);
437 }
438 
439 /* Depending on reuse, either build a new mat, or use the existing mat */
440 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
441 {
442   PetscErrorCode   ierr;
443   Mat_SeqAIJ       *aseq;
444 
445   PetscFunctionBegin;
446   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
447   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
448     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
449   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
450     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
451   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
452     if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
453     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
454     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
455     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
456     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
457     aseq = static_cast<Mat_SeqAIJ*>(A->data);
458     if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */
459       if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
460       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
461     }
462   }
463   PetscFunctionReturn(0);
464 }
465 
466 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
467    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
468  */
469 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
470 {
471   PetscErrorCode        ierr;
472   Mat_SeqAIJ            *bseq;
473   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
474   Mat                   mat;
475 
476   PetscFunctionBegin;
477   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
478   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
479   mat  = *B;
480   if (A->assembled) {
481     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
482     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
483     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
484     /* Now copy values to B if needed */
485     if (dupOption == MAT_COPY_VALUES) {
486       if (akok->a_dual.need_sync_device()) {
487         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
488         bkok->a_dual.modify_host();
489       } else { /* If device has the latest data, we only copy data on device */
490         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
491         bkok->a_dual.modify_device();
492       }
493     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
494       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
495       bkok->a_dual.modify_host();
496     }
497     mat->spptr = bkok;
498   }
499 
500   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
501   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
502   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
503   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
504   PetscFunctionReturn(0);
505 }
506 
507 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
508 {
509   PetscErrorCode    ierr;
510   Mat               At;
511   KokkosCsrMatrix   *internT,*csrmatT;
512   Mat_SeqAIJKokkos  *atkok,*bkok;
513 
514   PetscFunctionBegin;
515   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
516   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
517     CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */
518     CHKERRCXX(atkok   = new Mat_SeqAIJKokkos(*csrmatT));
519     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
520     if (reuse == MAT_INITIAL_MATRIX) *B = At;
521     else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
522   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
523     if ((*B)->assembled) {
524       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
525       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
526       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
527     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
528       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
529       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
530       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
531       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
532       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
533     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
534   }
535   PetscFunctionReturn(0);
536 }
537 
538 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
539 {
540   PetscErrorCode             ierr;
541   Mat_SeqAIJKokkos           *aijkok;
542 
543   PetscFunctionBegin;
544   if (A->factortype == MAT_FACTOR_NONE) {
545     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
546     if (aijkok) {
547       if (aijkok->device_mat_d.data()) {
548         delete aijkok->colmap_d;
549         delete aijkok->i_uncompressed_d;
550       }
551       if (aijkok->diag_d) delete aijkok->diag_d;
552     }
553     delete aijkok;
554   } else {
555     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
556   }
557   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
558   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
559   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       NULL);CHKERRQ(ierr);
560   A->spptr = NULL;
561   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
562   PetscFunctionReturn(0);
563 }
564 
565 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
566 {
567   PetscErrorCode ierr;
568 
569   PetscFunctionBegin;
570   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
571   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
572   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
573   PetscFunctionReturn(0);
574 }
575 
576 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
577 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
578 {
579   PetscErrorCode               ierr;
580   Mat_SeqAIJ                   *a,*b;
581   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
582   MatScalarKokkosView          aa,ba,ca;
583   MatRowMapKokkosView          ai,bi,ci;
584   MatColIdxKokkosView          aj,bj,cj;
585   PetscInt                     m,n,nnz,aN;
586 
587   PetscFunctionBegin;
588   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
589   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
590   PetscValidPointer(C,4);
591   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
592   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
593   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
594   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
595 
596   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
597   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
598   a    = static_cast<Mat_SeqAIJ*>(A->data);
599   b    = static_cast<Mat_SeqAIJ*>(B->data);
600   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
601   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
602   aa   = akok->a_dual.view_device();
603   ai   = akok->i_dual.view_device();
604   ba   = bkok->a_dual.view_device();
605   bi   = bkok->i_dual.view_device();
606   m    = A->rmap->n; /* M, N and nnz of C */
607   n    = A->cmap->n + B->cmap->n;
608   nnz  = a->nz + b->nz;
609   aN   = A->cmap->n; /* N of A */
610   if (reuse == MAT_INITIAL_MATRIX) {
611     aj = akok->j_dual.view_device();
612     bj = bkok->j_dual.view_device();
613     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
614     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
615     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
616     ca = ca_dual.view_device();
617     ci = ci_dual.view_device();
618     cj = cj_dual.view_device();
619 
620     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
621     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
622       PetscInt i = t.league_rank(); /* row i */
623       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
624 
625       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
626         ci(i) = coffset;
627         if (i == m-1) ci(m) = ai(m) + bi(m);
628       });
629 
630       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
631         if (k < alen) {
632           ca(coffset+k) = aa(ai(i)+k);
633           cj(coffset+k) = aj(ai(i)+k);
634         } else {
635           ca(coffset+k) = ba(bi(i)+k-alen);
636           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
637         }
638       });
639     });
640     ca_dual.modify_device();
641     ci_dual.modify_device();
642     cj_dual.modify_device();
643     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
644     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
645   } else if (reuse == MAT_REUSE_MATRIX) {
646     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
647     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
648     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
649     ca   = ckok->a_dual.view_device();
650     ci   = ckok->i_dual.view_device();
651 
652     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
653       PetscInt i = t.league_rank(); /* row i */
654       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
655       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
656         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
657         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
658       });
659     });
660     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
661   }
662   PetscFunctionReturn(0);
663 }
664 
665 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
666 {
667   PetscFunctionBegin;
668   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
669   PetscFunctionReturn(0);
670 }
671 
672 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
673 {
674   PetscErrorCode                 ierr;
675   Mat_Product                    *product = C->product;
676   Mat                            A,B;
677   bool                           transA,transB; /* use bool, since KK needs this type */
678   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
679   Mat_SeqAIJ                     *c;
680   MatProductData_SeqAIJKokkos    *pdata;
681   KokkosCsrMatrix                *csrmatA,*csrmatB;
682 
683   PetscFunctionBegin;
684   MatCheckProduct(C,1);
685   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
686   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
687 
688   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
689     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
690     PetscFunctionReturn(0);
691   }
692 
693   switch (product->type) {
694     case MATPRODUCT_AB:  transA = false; transB = false; break;
695     case MATPRODUCT_AtB: transA = true;  transB = false; break;
696     case MATPRODUCT_ABt: transA = false; transB = true;  break;
697     default:
698       SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
699   }
700 
701   A     = product->A;
702   B     = product->B;
703   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
704   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
705   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
706   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
707   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
708 
709   if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
710 
711   csrmatA = &akok->csrmat;
712   csrmatB = &bkok->csrmat;
713 
714   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
715   if (transA) {
716     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
717     transA = false;
718   }
719 
720   if (transB) {
721     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
722     transB = false;
723   }
724   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
725   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
726   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
727   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
728   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
729   /* shorter version of MatAssemblyEnd_SeqAIJ */
730   c = (Mat_SeqAIJ*)C->data;
731   ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
732   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
733   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
734   c->reallocs         = 0;
735   C->info.mallocs     = 0;
736   C->info.nz_unneeded = 0;
737   C->assembled        = C->was_assembled = PETSC_TRUE;
738   C->num_ass++;
739   PetscFunctionReturn(0);
740 }
741 
742 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
743 {
744   PetscErrorCode                 ierr;
745   Mat_Product                    *product = C->product;
746   MatProductType                 ptype;
747   Mat                            A,B;
748   bool                           transA,transB;
749   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
750   MatProductData_SeqAIJKokkos    *pdata;
751   MPI_Comm                       comm;
752   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
753 
754   PetscFunctionBegin;
755   MatCheckProduct(C,1);
756   ierr = PetscObjectGetComm((PetscObject)C,&comm);
757   if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty");
758   A       = product->A;
759   B       = product->B;
760   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
761   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
762   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
763   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
764   csrmatA = &akok->csrmat;
765   csrmatB = &bkok->csrmat;
766 
767   ptype   = product->type;
768   switch (ptype) {
769     case MATPRODUCT_AB:  transA = false; transB = false; break;
770     case MATPRODUCT_AtB: transA = true;  transB = false; break;
771     case MATPRODUCT_ABt: transA = false; transB = true;  break;
772     default:
773       SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
774   }
775 
776   product->data = pdata = new MatProductData_SeqAIJKokkos();
777   pdata->kh.set_team_work_size(16);
778   pdata->kh.set_dynamic_scheduling(true);
779   pdata->reusesym = product->api_user;
780 
781   /* TODO: add command line options to select spgemm algorithms */
782   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
783   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
784      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
785    */
786   pdata->kh.create_spgemm_handle(spgemm_alg);
787 
788   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
789   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
790   if (transA) {
791     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
792     transA = false;
793   }
794 
795   if (transB) {
796     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
797     transB = false;
798   }
799 
800   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
801   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
802     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
803     calling new Mat_SeqAIJKokkos().
804     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
805   */
806   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
807   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
808   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
809 
810   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
811   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
812   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
813   PetscFunctionReturn(0);
814 }
815 
816 /* handles sparse matrix matrix ops */
817 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
818 {
819   PetscErrorCode ierr;
820   Mat_Product    *product = mat->product;
821   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
822 
823   PetscFunctionBegin;
824   MatCheckProduct(mat,1);
825   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
826   if (product->type == MATPRODUCT_ABC) {
827     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
828   }
829   if (Biskok && Ciskok) {
830     switch (product->type) {
831     case MATPRODUCT_AB:
832     case MATPRODUCT_AtB:
833     case MATPRODUCT_ABt:
834       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
835       break;
836     case MATPRODUCT_PtAP:
837     case MATPRODUCT_RARt:
838     case MATPRODUCT_ABC:
839       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
840       break;
841     default:
842       break;
843     }
844   } else { /* fallback for AIJ */
845     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
846   }
847   PetscFunctionReturn(0);
848 }
849 
850 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
851 {
852   PetscErrorCode   ierr;
853   Mat_SeqAIJKokkos *aijkok;
854 
855   PetscFunctionBegin;
856   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
857   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
858   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
859   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
860   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
861   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
862   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
863   PetscFunctionReturn(0);
864 }
865 
866 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
867 {
868   PetscErrorCode   ierr;
869   Mat_SeqAIJKokkos *aijkok;
870 
871   PetscFunctionBegin;
872   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
873   if (aijkok) { /* Only zero the device if data is already there */
874     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
875     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
876   } else { /* Might be preallocated but not assembled */
877     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
878   }
879   PetscFunctionReturn(0);
880 }
881 
882 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
883 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
884 {
885   PetscErrorCode     ierr;
886   Mat_SeqAIJKokkos   *aijkok;
887 
888   PetscFunctionBegin;
889   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
890   PetscValidPointer(kv,2);
891   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
892   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
893   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
894   *kv    = aijkok->a_dual.view_device();
895   PetscFunctionReturn(0);
896 }
897 
898 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
899 {
900   PetscFunctionBegin;
901   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
902   PetscValidPointer(kv,2);
903   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
904   PetscFunctionReturn(0);
905 }
906 
907 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
908 {
909   PetscErrorCode     ierr;
910   Mat_SeqAIJKokkos   *aijkok;
911 
912   PetscFunctionBegin;
913   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
914   PetscValidPointer(kv,2);
915   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
916   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
917   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
918   *kv    = aijkok->a_dual.view_device();
919   PetscFunctionReturn(0);
920 }
921 
922 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
923 {
924   PetscErrorCode     ierr;
925 
926   PetscFunctionBegin;
927   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
928   PetscValidPointer(kv,2);
929   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
930   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
931   PetscFunctionReturn(0);
932 }
933 
934 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
935 {
936   Mat_SeqAIJKokkos   *aijkok;
937 
938   PetscFunctionBegin;
939   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
940   PetscValidPointer(kv,2);
941   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
942   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
943   *kv    = aijkok->a_dual.view_device();
944   PetscFunctionReturn(0);
945 }
946 
947 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
948 {
949   PetscErrorCode     ierr;
950 
951   PetscFunctionBegin;
952   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
953   PetscValidPointer(kv,2);
954   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
955   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
956   PetscFunctionReturn(0);
957 }
958 
959 /* Computes Y += alpha X */
960 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
961 {
962   PetscErrorCode             ierr;
963   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
964   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
965   ConstMatScalarKokkosView   Xa;
966   MatScalarKokkosView        Ya;
967 
968   PetscFunctionBegin;
969   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
970   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
971   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
972   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
973   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
974 
975   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
976     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
977     PetscBool e;
978     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
979     if (e) {
980       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
981       if (e) pattern = SAME_NONZERO_PATTERN;
982     }
983   }
984 
985   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
986     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
987     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
988     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
989   */
990   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
991   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
992   Xa   = xkok->a_dual.view_device();
993   Ya   = ykok->a_dual.view_device();
994 
995   if (pattern == SAME_NONZERO_PATTERN) {
996     KokkosBlas::axpy(alpha,Xa,Ya);
997     ierr = MatSeqAIJKokkosModifyDevice(Y);
998   } else if (pattern == SUBSET_NONZERO_PATTERN) {
999     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
1000     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
1001 
1002     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
1003       PetscInt i = t.league_rank(); /* row i */
1004       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
1005         PetscInt p,q = Yi(i);
1006         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1007           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1008           if (Xj(p) == Yj(q)) { /* Found it */
1009             Ya(q) += alpha * Xa(p);
1010             q++;
1011           } else {
1012             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1013                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1014             */
1015             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1016           }
1017         }
1018       });
1019     });
1020     ierr = MatSeqAIJKokkosModifyDevice(Y);
1021   } else { /* different nonzero patterns */
1022     Mat             Z;
1023     KokkosCsrMatrix zcsr;
1024     KernelHandle    kh;
1025     kh.create_spadd_handle(false);
1026     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1027     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1028     zkok = new Mat_SeqAIJKokkos(zcsr);
1029     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1030     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1031     kh.destroy_spadd_handle();
1032   }
1033   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1034   ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */
1035   PetscFunctionReturn(0);
1036 }
1037 
1038 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
1039 {
1040   PetscErrorCode            ierr;
1041   MPI_Comm                  comm;
1042   PetscInt                  *i,*j,*perm,*jmap;
1043   PetscInt                  k,M,N,p,q,row,start,end,nnz,nneg;
1044   PetscBool                 has_repeats = PETSC_FALSE;
1045   PetscInt                  *Ai,*Aj;
1046   PetscScalar               *Aa;
1047   Mat_SeqAIJKokkos          *akok;
1048   Mat_SeqAIJ                *aseq;
1049   MatRowMapKokkosViewHost   perm_h("perm",n),jmap_h;
1050   Mat                       newmat;
1051 
1052   PetscFunctionBegin;
1053   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
1054   ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr);
1055   ierr = PetscMalloc2(n,&i,n,&j);CHKERRQ(ierr);
1056   ierr = PetscArraycpy(i,coo_i,n);CHKERRQ(ierr); /* Make a copy since we'll modify it */
1057   ierr = PetscArraycpy(j,coo_j,n);CHKERRQ(ierr);
1058   perm = perm_h.data();
1059   for (k=0; k<n; k++) { /* Ignore entries with negative row or col indices */
1060     if (j[k] < 0) i[k] = -1;
1061     perm[k] = k;
1062   }
1063 
1064   /* Sort by row */
1065   ierr = PetscSortIntWithArrayPair(n,i,j,perm);CHKERRQ(ierr);
1066   for (k=0; k<n; k++) {if (i[k] >= 0) break;} /* Advance k to the first row with a non-negative index */
1067   nneg = k;
1068   Kokkos::resize(jmap_h,n-nneg+1); /* Allocate an extra to make a CSR-like data structure. jmap[i] is the number of repeats for i-th nonzero */
1069   nnz  = 0; /* Total number of unique nonzeros to be counted */
1070   jmap = jmap_h.data(); /* In the end, only the first nnz+1 elements in jmap[] are significant. */
1071   jmap++; /* Inc jmap by 1 for convinience */
1072 
1073   ierr = PetscCalloc1(M+1,&Ai);CHKERRQ(ierr); /* CSR of A */
1074   ierr = PetscMalloc1(n-nneg,&Aj);CHKERRQ(ierr); /* We have at most n-k unique nonzeros */
1075 
1076   /* In each row, sort by column, then unique column indices to get row length */
1077   Ai++; /* Inc by 1 for convinience */
1078   q = 0; /* q-th unique nonzero, with q starting from 0 */
1079   while (k<n) {
1080     row   = i[k];
1081     start = k; /* [start,end) indices for this row */
1082     while (k<n && i[k] == row) k++;
1083     end   = k;
1084     ierr  = PetscSortIntWithArray(end-start,j+start,perm+start);CHKERRQ(ierr);
1085     /* Find number of unique col entries in this row */
1086     Aj[q]   = j[start]; /* Log the first nonzero in this row */
1087     jmap[q] = 1; /* Number of repeats of this nozero entry */
1088     Ai[row] = 1;
1089     nnz++;
1090 
1091     for (p=start+1; p<end; p++) { /* Scan remaining nonzero in this row */
1092       if (j[p] != j[p-1]) { /* Meet a new nonzero */
1093         q++;
1094         jmap[q] = 1;
1095         Aj[q]   = j[p];
1096         Ai[row]++;
1097         nnz++;
1098       } else {
1099         jmap[q]++;
1100         has_repeats = PETSC_TRUE;
1101       }
1102     }
1103     q++; /* Move to next row and thus next unique nonzero */
1104   }
1105   ierr = PetscFree2(i,j);CHKERRQ(ierr);
1106 
1107   Ai--; /* Back to the beginning of Ai[] */
1108   for (k=0; k<M; k++) Ai[k+1] += Ai[k];
1109   jmap--; /* Back to the beginning of jmap[] */
1110   jmap[0] = 0;
1111   if (has_repeats) { /* Only transform jmap[] to CSR when having repeats, otherwise jmap[] is not used */
1112     for (k=0; k<nnz; k++) jmap[k+1] += jmap[k];
1113     Kokkos::resize(jmap_h,nnz+1); /* Resize wrt the actual number of nonzeros */
1114   }
1115 
1116   if (nnz < n-nneg) { /* Realloc Aj[] to actual number of nonzeros */
1117     PetscInt *Aj_new;
1118     ierr = PetscMalloc1(nnz,&Aj_new);CHKERRQ(ierr);
1119     ierr = PetscArraycpy(Aj_new,Aj,nnz);CHKERRQ(ierr);
1120     ierr = PetscFree(Aj);CHKERRQ(ierr);
1121     Aj = Aj_new;
1122   }
1123 
1124   ierr = PetscMalloc1(nnz,&Aa);CHKERRQ(ierr);
1125   ierr = MatCreateSeqAIJWithArrays(comm,M,N,Ai,Aj,Aa,&newmat);CHKERRQ(ierr);
1126   aseq = static_cast<Mat_SeqAIJ*>(newmat->data);
1127   aseq->singlemalloc = PETSC_FALSE; /* Let newmat own Ai,Aj,Aa */
1128   aseq->free_a       = aseq->free_ij = PETSC_TRUE;
1129 
1130   ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
1131   ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
1132   ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
1133 
1134   if (nneg) { /* Discard heading entries with negative indices in perm_h, as we'll access it from index 0 in MatSetValuesCOO */
1135     MatRowMapKokkosViewHost   newperm_h("perm",n-nneg);
1136     Kokkos::deep_copy(newperm_h,Kokkos::subview(perm_h,Kokkos::make_pair(nneg,n)));
1137     perm_h = newperm_h;
1138   }
1139 
1140   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1141   akok->SetUpCOO(n,has_repeats,jmap_h,perm_h);
1142   PetscFunctionReturn(0);
1143 }
1144 
1145 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1146 {
1147   PetscErrorCode              ierr;
1148   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1149   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1150   PetscInt                    nz = aseq->nz;
1151   const MatRowMapKokkosView&  jmap = akok->jmap_d;
1152   const MatRowMapKokkosView&  perm = akok->perm_d;
1153   MatScalarKokkosView         Aa;
1154   ConstMatScalarKokkosView    kv;
1155   PetscMemType                memtype;
1156 
1157   PetscFunctionBegin;
1158   if (!v) { /* NULL v means an all zero array */
1159     ierr = MatZeroEntries(A);CHKERRQ(ierr);
1160     PetscFunctionReturn(0);
1161   }
1162 
1163   ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
1164   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1165     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,akok->coo_n));
1166   } else {
1167     kv = ConstMatScalarKokkosView(v,akok->coo_n); /* Directly use v[]'s memory */
1168   }
1169 
1170   ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr); /* Might read and write matrix values */
1171   if (imode == INSERT_VALUES) {
1172     Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
1173   }
1174 
1175   if (akok->coo_has_repeats) {
1176     Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) {
1177       for (PetscInt k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));
1178     });
1179   } else {
1180     Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) {Aa(i) += kv(perm(i));});
1181   }
1182   ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);
1183   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1184   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1185   PetscFunctionReturn(0);
1186 }
1187 
1188 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1189 {
1190   PetscErrorCode ierr;
1191 
1192   PetscFunctionBegin;
1193   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
1194   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
1195   B->offloadmask = PETSC_OFFLOAD_CPU;
1196   PetscFunctionReturn(0);
1197 }
1198 
1199 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1200 {
1201   PetscErrorCode     ierr;
1202   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1203 
1204   PetscFunctionBegin;
1205   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1206   A->boundtocpu  = PETSC_FALSE;
1207 
1208   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1209   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1210   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1211   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1212   A->ops->scale                     = MatScale_SeqAIJKokkos;
1213   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1214   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1215   A->ops->mult                      = MatMult_SeqAIJKokkos;
1216   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1217   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1218   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1219   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1220   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1221   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1222   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1223   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1224   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1225   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1226   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1227   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1228   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1229   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1230 
1231   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr);
1232   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr);
1233   PetscFunctionReturn(0);
1234 }
1235 
1236 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1237 {
1238   PetscErrorCode ierr;
1239   Mat_SeqAIJ     *aseq;
1240   PetscInt       i,m,n;
1241 
1242   PetscFunctionBegin;
1243   if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1244 
1245   m    = akok->nrows();
1246   n    = akok->ncols();
1247   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1248   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1249 
1250   /* Set up data structures of A as a MATSEQAIJ */
1251   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1252   aseq = (Mat_SeqAIJ*)(A)->data;
1253 
1254   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1255   akok->j_dual.sync_host();
1256 
1257   aseq->i            = akok->i_host_data();
1258   aseq->j            = akok->j_host_data();
1259   aseq->a            = akok->a_host_data();
1260   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1261   aseq->singlemalloc = PETSC_FALSE;
1262   aseq->free_a       = PETSC_FALSE;
1263   aseq->free_ij      = PETSC_FALSE;
1264   aseq->nz           = akok->nnz();
1265   aseq->maxnz        = aseq->nz;
1266 
1267   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1268   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1269   for (i=0; i<m; i++) {
1270     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1271   }
1272 
1273   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1274   akok->nonzerostate = A->nonzerostate;
1275   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1276   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1277   A->spptr = akok;
1278   PetscFunctionReturn(0);
1279 }
1280 
1281 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1282 
1283    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1284  */
1285 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1286 {
1287   PetscErrorCode ierr;
1288 
1289   PetscFunctionBegin;
1290   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1291   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
1292   PetscFunctionReturn(0);
1293 }
1294 
1295 /* --------------------------------------------------------------------------------*/
1296 /*@C
1297    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1298    (the default parallel PETSc format). This matrix will ultimately be handled by
1299    Kokkos for calculations. For good matrix
1300    assembly performance the user should preallocate the matrix storage by setting
1301    the parameter nz (or the array nnz).  By setting these parameters accurately,
1302    performance during matrix assembly can be increased by more than a factor of 50.
1303 
1304    Collective
1305 
1306    Input Parameters:
1307 +  comm - MPI communicator, set to PETSC_COMM_SELF
1308 .  m - number of rows
1309 .  n - number of columns
1310 .  nz - number of nonzeros per row (same for all rows)
1311 -  nnz - array containing the number of nonzeros in the various rows
1312          (possibly different for each row) or NULL
1313 
1314    Output Parameter:
1315 .  A - the matrix
1316 
1317    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1318    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1319    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1320 
1321    Notes:
1322    If nnz is given then nz is ignored
1323 
1324    The AIJ format (also called the Yale sparse matrix format or
1325    compressed row storage), is fully compatible with standard Fortran 77
1326    storage.  That is, the stored row and column indices can begin at
1327    either one (as in Fortran) or zero.  See the users' manual for details.
1328 
1329    Specify the preallocated storage with either nz or nnz (not both).
1330    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1331    allocation.  For large problems you MUST preallocate memory or you
1332    will get TERRIBLE performance, see the users' manual chapter on matrices.
1333 
1334    By default, this format uses inodes (identical nodes) when possible, to
1335    improve numerical efficiency of matrix-vector products and solves. We
1336    search for consecutive rows with the same nonzero structure, thereby
1337    reusing matrix information to achieve increased efficiency.
1338 
1339    Level: intermediate
1340 
1341 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1342 @*/
1343 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1344 {
1345   PetscErrorCode ierr;
1346 
1347   PetscFunctionBegin;
1348   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1349   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1350   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1351   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1352   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1353   PetscFunctionReturn(0);
1354 }
1355 
1356 typedef Kokkos::TeamPolicy<>::member_type team_member;
1357 //
1358 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
1359 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1360 //
1361 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1362 {
1363   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1364   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1365   IS                 isrow = b->row,isicol = b->icol;
1366   PetscErrorCode     ierr;
1367   const PetscInt     *r_h,*ic_h;
1368   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data();
1369   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1370   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1371   PetscBool          row_identity,col_identity;
1372   PetscInt           nc, Nf, nVec=32; // should be a parameter
1373   PetscContainer     container;
1374 
1375   PetscFunctionBegin;
1376   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1377   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
1378   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1379   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
1380   if (container) {
1381     PetscInt *pNf=NULL, nv;
1382     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
1383     Nf = (*pNf)%1000;
1384     nv = (*pNf)/1000;
1385     if (nv>0) nVec = nv;
1386   } else Nf = 1;
1387   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %" PetscInt_FMT " %" PetscInt_FMT,n,Nf);
1388   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
1389   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
1390   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
1391   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1392   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1393   {
1394 #define KOKKOS_SHARED_LEVEL 1
1395     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1396     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1397     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1398     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1399     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1400     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1401     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1402     size_t flops_h = 0.0;
1403     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1404     Kokkos::View<size_t> d_flops_k ("flops");
1405     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1406     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1407     Kokkos::deep_copy (d_flops_k, h_flops_k);
1408     Kokkos::deep_copy (d_r_k, h_r_k);
1409     Kokkos::deep_copy (d_ic_k, h_ic_k);
1410     // Fill A --> fact
1411     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1412         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1413         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1414         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1415         // zero rows of B
1416         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1417             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1418             PetscScalar *baL = ba_d + bi_d[rowb];
1419             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1420             /* zero (unfactored row) */
1421             for (int j=0;j<nzbL;j++) baL[j] = 0;
1422             for (int j=0;j<nzbU;j++) baU[j] = 0;
1423           });
1424         // copy A into B
1425         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1426             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1427             const PetscScalar *av    = aa_d + ai_d[rowa];
1428             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1429             /* load in initial (unfactored row) */
1430             for (int j=0;j<nza;j++) {
1431               PetscInt    colb = ic[ajtmp[j]];
1432               PetscScalar vala = av[j];
1433               if (colb == rowb) {
1434                 *(ba_d + bdiag_d[rowb]) = vala;
1435               } else {
1436                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1437                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1438                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1439                 for (int j=0; j<nz ; j++) {
1440                   if (pbj[j] == colb) {
1441                     pba[j] = vala;
1442                     set++;
1443                     break;
1444                   }
1445                 }
1446                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1447               }
1448             }
1449           });
1450       });
1451     Kokkos::fence();
1452 
1453     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1454         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1455         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1456         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1457         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1458         const PetscInt  start = field*nloc, end = start + nloc;
1459         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1460         // A22 panel update for each row A(1,:) and col A(:,1)
1461         for (int ii=start; ii<end-1; ii++) {
1462           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1463           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1464           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1465           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1466           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1467               PetscInt kIdx = j*Ni + field_block_idx;
1468               if (kIdx >= nzUi) /* void */ ;
1469               else {
1470                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1471                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1472                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1473                 size_t         st_idx;
1474                 // find and do L(k,i) = A(:k,i) / A(i,i)
1475                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1476                 // get column, there has got to be a better way
1477                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1478                     if (pjL[j] == ii) {
1479                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1480                       idx = j; // output
1481                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1482                     }
1483                 }, st_idx);
1484                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1485 #if defined(PETSC_USE_DEBUG)
1486                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1487 #endif
1488                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1489                 // U(i+1,:end)
1490                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1491                       PetscScalar Uij = baUi[uiIdx];
1492                       PetscInt    col = bjUi[uiIdx];
1493                       if (col==myk) {
1494                         // A_kk = A_kk - L_ki * U_ij(k)
1495                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1496                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1497                       } else {
1498                         PetscScalar    *start, *end, *pAkjv=NULL;
1499                         PetscInt       high, low;
1500                         const PetscInt *startj;
1501                         if (col<myk) { // L
1502                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1503                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1504                           start = pLki+1; // start at pLki+1, A22(myk,1)
1505                           startj= bj_d + bi_d[myk] + idx;
1506                           end   = ba_d + bi_d[myk+1];
1507                         } else {
1508                           PetscInt idx = bdiag_d[myk+1]+1;
1509                           start = ba_d + idx;
1510                           startj= bj_d + idx;
1511                           end   = ba_d + bdiag_d[myk];
1512                         }
1513                         // search for 'col', use bisection search - TODO
1514                         low  = 0;
1515                         high = (PetscInt)(end-start);
1516                         while (high-low > 5) {
1517                           int t = (low+high)/2;
1518                           if (startj[t] > col) high = t;
1519                           else                 low  = t;
1520                         }
1521                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1522                           if (startj[pAkjv-start] == col) break;
1523                         }
1524 #if defined(PETSC_USE_DEBUG)
1525                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1526 #endif
1527                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1528                       }
1529                     });
1530               }
1531             });
1532           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1533           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1534         } /* endof for (i=0; i<n; i++) { */
1535         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1536       });
1537     Kokkos::fence();
1538     Kokkos::deep_copy (h_flops_k, d_flops_k);
1539     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1540     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1541         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1542         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1543         /* Invert diagonal for simpler triangular solves */
1544         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1545             int i = start + outer_index*Ni + lg_rank%Ni;
1546             if (i < end) {
1547               PetscScalar *pv = ba_d + bdiag_d[i];
1548               *pv = 1.0/(*pv);
1549             }
1550           });
1551       });
1552   }
1553   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1554   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
1555   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
1556 
1557   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1558   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
1559   if (b->inode.size) {
1560     B->ops->solve = MatSolve_SeqAIJ_Inode;
1561   } else if (row_identity && col_identity) {
1562     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1563   } else {
1564     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1565   }
1566   B->offloadmask = PETSC_OFFLOAD_GPU;
1567   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
1568   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1569   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1570   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1571   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1572   B->assembled              = PETSC_TRUE;
1573   B->preallocated           = PETSC_TRUE;
1574 
1575   PetscFunctionReturn(0);
1576 }
1577 
1578 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1579 {
1580   PetscErrorCode   ierr;
1581 
1582   PetscFunctionBegin;
1583   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1584   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1585   PetscFunctionReturn(0);
1586 }
1587 
1588 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1589 {
1590   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1591 
1592   PetscFunctionBegin;
1593   if (!factors->sptrsv_symbolic_completed) {
1594     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1595     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1596     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1597   }
1598   PetscFunctionReturn(0);
1599 }
1600 
1601 /* Check if we need to update factors etc for transpose solve */
1602 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1603 {
1604   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1605   MatColIdxType             n = A->rmap->n;
1606 
1607   PetscFunctionBegin;
1608   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1609     /* Update L^T and do sptrsv symbolic */
1610     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1611     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1612     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1613     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1614 
1615     KokkosKernels::Impl::transpose_matrix<
1616       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1617       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1618       MatRowMapKokkosView,DefaultExecutionSpace>(
1619         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1620         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1621 
1622     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1623       We have to sort the indices, until KK provides finer control options.
1624     */
1625     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1626       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1627         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1628 
1629     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1630 
1631     /* Update U^T and do sptrsv symbolic */
1632     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1633     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1634     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1635     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1636 
1637     KokkosKernels::Impl::transpose_matrix<
1638       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1639       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1640       MatRowMapKokkosView,DefaultExecutionSpace>(
1641         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1642         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1643 
1644     /* Sort indices. See comments above */
1645     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1646       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1647         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1648 
1649     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1650     factors->transpose_updated = PETSC_TRUE;
1651   }
1652   PetscFunctionReturn(0);
1653 }
1654 
1655 /* Solve Ax = b, with A = LU */
1656 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1657 {
1658   PetscErrorCode                 ierr;
1659   ConstPetscScalarKokkosView     bv;
1660   PetscScalarKokkosView          xv;
1661   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1662 
1663   PetscFunctionBegin;
1664   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1665   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
1666   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1667   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1668   /* Solve L tmpv = b */
1669   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1670   /* Solve Ux = tmpv */
1671   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1672   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1673   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1674   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1675   PetscFunctionReturn(0);
1676 }
1677 
1678 /* Solve A^T x = b, where A^T = U^T L^T */
1679 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1680 {
1681   PetscErrorCode                 ierr;
1682   ConstPetscScalarKokkosView     bv;
1683   PetscScalarKokkosView          xv;
1684   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1685 
1686   PetscFunctionBegin;
1687   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1688   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
1689   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1690   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1691   /* Solve U^T tmpv = b */
1692   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1693 
1694   /* Solve L^T x = tmpv */
1695   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1696   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1697   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1698   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1699   PetscFunctionReturn(0);
1700 }
1701 
1702 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1703 {
1704   PetscErrorCode                 ierr;
1705   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1706   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1707   PetscInt                       fill_lev = info->levels;
1708 
1709   PetscFunctionBegin;
1710   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1711   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1712 
1713   auto a_d = aijkok->a_dual.view_device();
1714   auto i_d = aijkok->i_dual.view_device();
1715   auto j_d = aijkok->j_dual.view_device();
1716 
1717   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1718 
1719   B->assembled                       = PETSC_TRUE;
1720   B->preallocated                    = PETSC_TRUE;
1721   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1722   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1723   B->ops->matsolve                   = NULL;
1724   B->ops->matsolvetranspose          = NULL;
1725   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1726 
1727   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1728   factors->transpose_updated         = PETSC_FALSE;
1729   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1730   /* TODO: log flops, but how to know that? */
1731   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1732   PetscFunctionReturn(0);
1733 }
1734 
1735 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1736 {
1737   PetscErrorCode                 ierr;
1738   Mat_SeqAIJKokkos               *aijkok;
1739   Mat_SeqAIJ                     *b;
1740   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1741   PetscInt                       fill_lev = info->levels;
1742   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1743   PetscInt                       n = A->rmap->n;
1744 
1745   PetscFunctionBegin;
1746   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1747   /* Rebuild factors */
1748   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1749   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1750 
1751   /* Create a spiluk handle and then do symbolic factorization */
1752   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1753   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1754 
1755   auto spiluk_handle = factors->kh.get_spiluk_handle();
1756 
1757   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1758   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1759   Kokkos::realloc(factors->iU_d,n+1);
1760   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1761 
1762   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1763   auto i_d = aijkok->i_dual.view_device();
1764   auto j_d = aijkok->j_dual.view_device();
1765   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1766   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1767 
1768   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1769   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1770   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1771   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1772 
1773   /* TODO: add options to select sptrsv algorithms */
1774   /* Create sptrsv handles for L, U and their transpose */
1775  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1776   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1777  #else
1778   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1779  #endif
1780 
1781   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1782   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1783   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1784   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1785 
1786   /* Fill fields of the factor matrix B */
1787   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1788   b     = (Mat_SeqAIJ*)B->data;
1789   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1790   B->info.fill_ratio_given  = info->fill;
1791   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1792 
1793   B->offloadmask            = PETSC_OFFLOAD_GPU;
1794   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1795   PetscFunctionReturn(0);
1796 }
1797 
1798 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1799 {
1800   PetscErrorCode   ierr;
1801   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1802   const PetscInt   nrows   = A->rmap->n;
1803 
1804   PetscFunctionBegin;
1805   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1806   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1807   // move B data into Kokkos
1808   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
1809   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
1810   {
1811     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1812     if (!baijkok->diag_d) {
1813       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1814       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1815       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
1816     }
1817   }
1818   PetscFunctionReturn(0);
1819 }
1820 
1821 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1822 {
1823   PetscFunctionBegin;
1824   *type = MATSOLVERKOKKOS;
1825   PetscFunctionReturn(0);
1826 }
1827 
1828 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1829 {
1830   PetscFunctionBegin;
1831   *type = MATSOLVERKOKKOSDEVICE;
1832   PetscFunctionReturn(0);
1833 }
1834 
1835 /*MC
1836   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1837   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1838 
1839   Level: beginner
1840 
1841 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1842 M*/
1843 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1844 {
1845   PetscErrorCode ierr;
1846   PetscInt       n = A->rmap->n;
1847 
1848   PetscFunctionBegin;
1849   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1850   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1851   (*B)->factortype = ftype;
1852   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1853   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1854 
1855   if (ftype == MAT_FACTOR_LU) {
1856     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1857     (*B)->canuseordering         = PETSC_TRUE;
1858     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1859   } else if (ftype == MAT_FACTOR_ILU) {
1860     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1861     (*B)->canuseordering         = PETSC_FALSE;
1862     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1863   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1864 
1865   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1866   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1867   PetscFunctionReturn(0);
1868 }
1869 
1870 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1871 {
1872   PetscErrorCode ierr;
1873   PetscInt       n = A->rmap->n;
1874 
1875   PetscFunctionBegin;
1876   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1877   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1878   (*B)->factortype = ftype;
1879   (*B)->canuseordering = PETSC_TRUE;
1880   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1881   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1882 
1883   if (ftype == MAT_FACTOR_LU) {
1884     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1885     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1886   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1887 
1888   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1889   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1890   PetscFunctionReturn(0);
1891 }
1892 
1893 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1894 {
1895   PetscErrorCode ierr;
1896 
1897   PetscFunctionBegin;
1898   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1899   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1900   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1901   PetscFunctionReturn(0);
1902 }
1903 
1904 /* Utility to print out a KokkosCsrMatrix for debugging */
1905 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1906 {
1907   PetscErrorCode    ierr;
1908   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1909   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1910   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1911   const PetscInt    *i = iv.data();
1912   const PetscInt    *j = jv.data();
1913   const PetscScalar *a = av.data();
1914   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1915 
1916   PetscFunctionBegin;
1917   ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1918   for (PetscInt k=0; k<m; k++) {
1919     ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr);
1920     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1921       ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1922     }
1923     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1924   }
1925   PetscFunctionReturn(0);
1926 }
1927