xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 0a10e71bfc537e0d2d5d8bf3f77c9e85c2a711aa)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosKernels_Sorting.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 
18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   PetscErrorCode    ierr;
29   Mat_SeqAIJ        *aijseq;
30   Mat_SeqAIJKokkos  *aijkok;
31 
32   PetscFunctionBegin;
33   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
34   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
35 
36   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38 
39   /* If aijkok does not exist, we just copy i, j to device.
40      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
41      In both cases, we build a new aijkok structure.
42   */
43   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
44     delete aijkok;
45     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
46     A->spptr = aijkok;
47   }
48 
49   if (aijkok && aijkok->device_mat_d.data()) {
50     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
51   }
52   PetscFunctionReturn(0);
53 }
54 
55 /* Sync CSR data to device if not yet */
56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
57 {
58   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
59 
60   PetscFunctionBegin;
61   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
62   PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
63   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
64   if (aijkok->a_dual.need_sync_device()) {
65     aijkok->a_dual.sync_device();
66     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
67     aijkok->hermitian_updated = PETSC_FALSE;
68   }
69   PetscFunctionReturn(0);
70 }
71 
72 /* Mark the CSR data on device as modified */
73 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
74 {
75   PetscErrorCode   ierr;
76   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
77 
78   PetscFunctionBegin;
79   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
80   aijkok->a_dual.clear_sync_state();
81   aijkok->a_dual.modify_device();
82   aijkok->transpose_updated = PETSC_FALSE;
83   aijkok->hermitian_updated = PETSC_FALSE;
84   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
85   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
86   PetscFunctionReturn(0);
87 }
88 
89 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
90 {
91   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
92 
93   PetscFunctionBegin;
94   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
95    /* We do not expect one needs factors on host  */
96   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
97   PetscCheckFalse(!aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
98   aijkok->a_dual.sync_host();
99   PetscFunctionReturn(0);
100 }
101 
102 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
103 {
104   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
105 
106   PetscFunctionBegin;
107   if (aijkok) {
108     aijkok->a_dual.sync_host();
109     *array = aijkok->a_dual.view_host().data();
110   } else { /* Happens when calling MatSetValues on a newly created matrix */
111     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
112   }
113   PetscFunctionReturn(0);
114 }
115 
116 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
117 {
118   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
119 
120   PetscFunctionBegin;
121   if (aijkok) aijkok->a_dual.modify_host();
122   PetscFunctionReturn(0);
123 }
124 
125 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
126 {
127   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
128 
129   PetscFunctionBegin;
130   if (aijkok) {
131     aijkok->a_dual.sync_host();
132     *array = aijkok->a_dual.view_host().data();
133   } else {
134     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
135   }
136   PetscFunctionReturn(0);
137 }
138 
139 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
140 {
141   PetscFunctionBegin;
142   *array = NULL;
143   PetscFunctionReturn(0);
144 }
145 
146 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
147 {
148   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
149 
150   PetscFunctionBegin;
151   if (aijkok) {
152     *array = aijkok->a_dual.view_host().data();
153   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
154     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
155   }
156   PetscFunctionReturn(0);
157 }
158 
159 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
160 {
161   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
162 
163   PetscFunctionBegin;
164   if (aijkok) {
165     aijkok->a_dual.clear_sync_state();
166     aijkok->a_dual.modify_host();
167   }
168   PetscFunctionReturn(0);
169 }
170 
171 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
172 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
173 {
174   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
175   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
176 
177   PetscFunctionBegin;
178   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
179   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
180   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
181   PetscFunctionReturn(0);
182 }
183 
184 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
185 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
186 {
187   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
188 
189   PetscFunctionBegin;
190   if (aijkok && aijkok->device_mat_d.data()) {
191     *d_mat = aijkok->device_mat_d.data();
192   } else {
193     PetscErrorCode   ierr;
194     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
195     *d_mat  = NULL;
196   }
197   PetscFunctionReturn(0);
198 }
199 
200 /* Generate the transpose on device and cache it internally */
201 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
202 {
203   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
204 
205   PetscFunctionBegin;
206   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
208     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
209     CHKERRCXX(aijkok->a_dual.sync_device());
210     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
211     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
212     aijkok->transpose_updated = PETSC_TRUE;
213   }
214   *csrmatT = &aijkok->csrmatT;
215   PetscFunctionReturn(0);
216 }
217 
218 /* Generate the Hermitian on device and cache it internally */
219 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
220 {
221   PetscErrorCode                   ierr;
222   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
223 
224   PetscFunctionBegin;
225   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
226   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
227   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
228     CHKERRCXX(aijkok->a_dual.sync_device());
229     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
230     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
231    #if defined(PETSC_USE_COMPLEX)
232     const auto& a = aijkok->csrmatH.values;
233     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
234    #endif
235     aijkok->hermitian_updated = PETSC_TRUE;
236   }
237   *csrmatH = &aijkok->csrmatH;
238   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
239   PetscFunctionReturn(0);
240 }
241 
242 /* y = A x */
243 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
244 {
245   PetscErrorCode                   ierr;
246   Mat_SeqAIJKokkos                 *aijkok;
247   ConstPetscScalarKokkosView       xv;
248   PetscScalarKokkosView            yv;
249 
250   PetscFunctionBegin;
251   ierr   = PetscLogGpuTimeBegin();CHKERRQ(ierr);
252   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
253   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
254   ierr   = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
255   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
256   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
257   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
258   ierr   = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
259   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
260   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
261   ierr   = PetscLogGpuTimeEnd();CHKERRQ(ierr);
262   PetscFunctionReturn(0);
263 }
264 
265 /* y = A^T x */
266 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
267 {
268   PetscErrorCode                   ierr;
269   Mat_SeqAIJKokkos                 *aijkok;
270   const char                       *mode;
271   ConstPetscScalarKokkosView       xv;
272   PetscScalarKokkosView            yv;
273   KokkosCsrMatrix                  *csrmat;
274 
275   PetscFunctionBegin;
276   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
277   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
278   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
279   ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
280   if (A->form_explicit_transpose) {
281     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
282     mode = "N";
283   } else {
284     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
285     csrmat = &aijkok->csrmat;
286     mode = "T";
287   }
288   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
289   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
290   ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
291   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
292   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
293   PetscFunctionReturn(0);
294 }
295 
296 /* y = A^H x */
297 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
298 {
299   PetscErrorCode                   ierr;
300   Mat_SeqAIJKokkos                 *aijkok;
301   const char                       *mode;
302   ConstPetscScalarKokkosView       xv;
303   PetscScalarKokkosView            yv;
304   KokkosCsrMatrix                  *csrmat;
305 
306   PetscFunctionBegin;
307   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
308   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
309   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
310   ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
311   if (A->form_explicit_transpose) {
312     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
313     mode = "N";
314   } else {
315     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
316     csrmat = &aijkok->csrmat;
317     mode = "C";
318   }
319   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
320   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
321   ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
322   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
323   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
324   PetscFunctionReturn(0);
325 }
326 
327 /* z = A x + y */
328 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
329 {
330   PetscErrorCode                   ierr;
331   Mat_SeqAIJKokkos                 *aijkok;
332   ConstPetscScalarKokkosView       xv,yv;
333   PetscScalarKokkosView            zv;
334 
335   PetscFunctionBegin;
336   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
337   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
338   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
339   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
340   ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
341   if (zz != yy) Kokkos::deep_copy(zv,yv);
342   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
343   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
344   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
345   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
346   ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
347   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
348   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
349   PetscFunctionReturn(0);
350 }
351 
352 /* z = A^T x + y */
353 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
354 {
355   PetscErrorCode                   ierr;
356   Mat_SeqAIJKokkos                 *aijkok;
357   const char                       *mode;
358   ConstPetscScalarKokkosView       xv,yv;
359   PetscScalarKokkosView            zv;
360   KokkosCsrMatrix                  *csrmat;
361 
362   PetscFunctionBegin;
363   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
364   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
365   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
366   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
367   ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
368   if (zz != yy) Kokkos::deep_copy(zv,yv);
369   if (A->form_explicit_transpose) {
370     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
371     mode = "N";
372   } else {
373     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
374     csrmat = &aijkok->csrmat;
375     mode = "T";
376   }
377   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
378   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
379   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
380   ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
381   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
382   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
383   PetscFunctionReturn(0);
384 }
385 
386 /* z = A^H x + y */
387 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
388 {
389   PetscErrorCode                   ierr;
390   Mat_SeqAIJKokkos                 *aijkok;
391   const char                       *mode;
392   ConstPetscScalarKokkosView       xv,yv;
393   PetscScalarKokkosView            zv;
394   KokkosCsrMatrix                  *csrmat;
395 
396   PetscFunctionBegin;
397   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
398   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
399   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
400   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
401   ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
402   if (zz != yy) Kokkos::deep_copy(zv,yv);
403   if (A->form_explicit_transpose) {
404     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
405     mode = "N";
406   } else {
407     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
408     csrmat = &aijkok->csrmat;
409     mode = "C";
410   }
411   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
412   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
413   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
414   ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
415   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
416   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
417   PetscFunctionReturn(0);
418 }
419 
420 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
421 {
422   PetscErrorCode            ierr;
423   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
424 
425   PetscFunctionBegin;
426   switch (op) {
427     case MAT_FORM_EXPLICIT_TRANSPOSE:
428       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
429       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
430       A->form_explicit_transpose = flg;
431       break;
432     default:
433       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
434       break;
435   }
436   PetscFunctionReturn(0);
437 }
438 
439 /* Depending on reuse, either build a new mat, or use the existing mat */
440 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
441 {
442   PetscErrorCode   ierr;
443   Mat_SeqAIJ       *aseq;
444 
445   PetscFunctionBegin;
446   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
447   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
448     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
449   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
450     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
451   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
452     PetscCheckFalse(A != *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
453     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
454     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
455     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
456     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
457     aseq = static_cast<Mat_SeqAIJ*>(A->data);
458     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
459       PetscCheckFalse(A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
460       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
461     }
462   }
463   PetscFunctionReturn(0);
464 }
465 
466 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
467    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
468  */
469 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
470 {
471   PetscErrorCode        ierr;
472   Mat_SeqAIJ            *bseq;
473   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
474   Mat                   mat;
475 
476   PetscFunctionBegin;
477   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
478   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
479   mat  = *B;
480   if (A->assembled) {
481     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
482     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
483     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
484     /* Now copy values to B if needed */
485     if (dupOption == MAT_COPY_VALUES) {
486       if (akok->a_dual.need_sync_device()) {
487         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
488         bkok->a_dual.modify_host();
489       } else { /* If device has the latest data, we only copy data on device */
490         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
491         bkok->a_dual.modify_device();
492       }
493     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
494       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
495       bkok->a_dual.modify_host();
496     }
497     mat->spptr = bkok;
498   }
499 
500   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
501   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
502   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
503   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
504   PetscFunctionReturn(0);
505 }
506 
507 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
508 {
509   PetscErrorCode    ierr;
510   Mat               At;
511   KokkosCsrMatrix   *internT;
512   Mat_SeqAIJKokkos  *atkok,*bkok;
513 
514   PetscFunctionBegin;
515   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
516   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
517     /* Deep copy internT, as we want to isolate the internal transpose */
518     CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
519     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
520     if (reuse == MAT_INITIAL_MATRIX) *B = At;
521     else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
522   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
523     if ((*B)->assembled) {
524       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
525       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
526       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
527     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
528       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
529       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
530       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
531       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
532       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
533     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
534   }
535   PetscFunctionReturn(0);
536 }
537 
538 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
539 {
540   PetscErrorCode             ierr;
541   Mat_SeqAIJKokkos           *aijkok;
542 
543   PetscFunctionBegin;
544   if (A->factortype == MAT_FACTOR_NONE) {
545     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
546     delete aijkok;
547   } else {
548     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
549   }
550   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
551   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
552   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       NULL);CHKERRQ(ierr);
553   A->spptr = NULL;
554   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
555   PetscFunctionReturn(0);
556 }
557 
558 /*MC
559    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
560 
561    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
562 
563    Options Database Keys:
564 .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
565 
566   Level: beginner
567 
568 .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS
569 M*/
570 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
571 {
572   PetscErrorCode ierr;
573 
574   PetscFunctionBegin;
575   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
576   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
577   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
578   PetscFunctionReturn(0);
579 }
580 
581 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
582 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
583 {
584   PetscErrorCode               ierr;
585   Mat_SeqAIJ                   *a,*b;
586   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
587   MatScalarKokkosView          aa,ba,ca;
588   MatRowMapKokkosView          ai,bi,ci;
589   MatColIdxKokkosView          aj,bj,cj;
590   PetscInt                     m,n,nnz,aN;
591 
592   PetscFunctionBegin;
593   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
594   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
595   PetscValidPointer(C,4);
596   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
597   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
598   PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
599   PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
600 
601   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
602   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
603   a    = static_cast<Mat_SeqAIJ*>(A->data);
604   b    = static_cast<Mat_SeqAIJ*>(B->data);
605   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
606   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
607   aa   = akok->a_dual.view_device();
608   ai   = akok->i_dual.view_device();
609   ba   = bkok->a_dual.view_device();
610   bi   = bkok->i_dual.view_device();
611   m    = A->rmap->n; /* M, N and nnz of C */
612   n    = A->cmap->n + B->cmap->n;
613   nnz  = a->nz + b->nz;
614   aN   = A->cmap->n; /* N of A */
615   if (reuse == MAT_INITIAL_MATRIX) {
616     aj = akok->j_dual.view_device();
617     bj = bkok->j_dual.view_device();
618     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
619     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
620     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
621     ca = ca_dual.view_device();
622     ci = ci_dual.view_device();
623     cj = cj_dual.view_device();
624 
625     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
626     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
627       PetscInt i = t.league_rank(); /* row i */
628       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
629 
630       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
631         ci(i) = coffset;
632         if (i == m-1) ci(m) = ai(m) + bi(m);
633       });
634 
635       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
636         if (k < alen) {
637           ca(coffset+k) = aa(ai(i)+k);
638           cj(coffset+k) = aj(ai(i)+k);
639         } else {
640           ca(coffset+k) = ba(bi(i)+k-alen);
641           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
642         }
643       });
644     });
645     ca_dual.modify_device();
646     ci_dual.modify_device();
647     cj_dual.modify_device();
648     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
649     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
650   } else if (reuse == MAT_REUSE_MATRIX) {
651     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
652     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
653     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
654     ca   = ckok->a_dual.view_device();
655     ci   = ckok->i_dual.view_device();
656 
657     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
658       PetscInt i = t.league_rank(); /* row i */
659       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
660       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
661         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
662         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
663       });
664     });
665     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
666   }
667   PetscFunctionReturn(0);
668 }
669 
670 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
671 {
672   PetscFunctionBegin;
673   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
674   PetscFunctionReturn(0);
675 }
676 
677 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
678 {
679   PetscErrorCode                 ierr;
680   Mat_Product                    *product = C->product;
681   Mat                            A,B;
682   bool                           transA,transB; /* use bool, since KK needs this type */
683   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
684   Mat_SeqAIJ                     *c;
685   MatProductData_SeqAIJKokkos    *pdata;
686   KokkosCsrMatrix                *csrmatA,*csrmatB;
687 
688   PetscFunctionBegin;
689   MatCheckProduct(C,1);
690   PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
691   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
692 
693   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
694     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
695     PetscFunctionReturn(0);
696   }
697 
698   switch (product->type) {
699     case MATPRODUCT_AB:  transA = false; transB = false; break;
700     case MATPRODUCT_AtB: transA = true;  transB = false; break;
701     case MATPRODUCT_ABt: transA = false; transB = true;  break;
702     default:
703       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
704   }
705 
706   A     = product->A;
707   B     = product->B;
708   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
709   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
710   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
711   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
712   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
713 
714   PetscCheckFalse(!ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
715 
716   csrmatA = &akok->csrmat;
717   csrmatB = &bkok->csrmat;
718 
719   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
720   if (transA) {
721     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
722     transA = false;
723   }
724 
725   if (transB) {
726     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
727     transB = false;
728   }
729   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
730   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
731   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
732   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
733   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
734   /* shorter version of MatAssemblyEnd_SeqAIJ */
735   c = (Mat_SeqAIJ*)C->data;
736   ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
737   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
738   ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
739   c->reallocs         = 0;
740   C->info.mallocs     = 0;
741   C->info.nz_unneeded = 0;
742   C->assembled        = C->was_assembled = PETSC_TRUE;
743   C->num_ass++;
744   PetscFunctionReturn(0);
745 }
746 
747 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
748 {
749   PetscErrorCode                 ierr;
750   Mat_Product                    *product = C->product;
751   MatProductType                 ptype;
752   Mat                            A,B;
753   bool                           transA,transB;
754   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
755   MatProductData_SeqAIJKokkos    *pdata;
756   MPI_Comm                       comm;
757   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
758 
759   PetscFunctionBegin;
760   MatCheckProduct(C,1);
761   ierr = PetscObjectGetComm((PetscObject)C,&comm);
762   PetscCheckFalse(product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
763   A       = product->A;
764   B       = product->B;
765   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
766   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
767   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
768   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
769   csrmatA = &akok->csrmat;
770   csrmatB = &bkok->csrmat;
771 
772   ptype   = product->type;
773   switch (ptype) {
774     case MATPRODUCT_AB:  transA = false; transB = false; break;
775     case MATPRODUCT_AtB: transA = true;  transB = false; break;
776     case MATPRODUCT_ABt: transA = false; transB = true;  break;
777     default:
778       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
779   }
780 
781   product->data = pdata = new MatProductData_SeqAIJKokkos();
782   pdata->kh.set_team_work_size(16);
783   pdata->kh.set_dynamic_scheduling(true);
784   pdata->reusesym = product->api_user;
785 
786   /* TODO: add command line options to select spgemm algorithms */
787   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
788   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
789      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
790    */
791   pdata->kh.create_spgemm_handle(spgemm_alg);
792 
793   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
794   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
795   if (transA) {
796     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
797     transA = false;
798   }
799 
800   if (transB) {
801     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
802     transB = false;
803   }
804 
805   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
806   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
807     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
808     calling new Mat_SeqAIJKokkos().
809     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
810   */
811   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
812   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
813   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
814 
815   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
816   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
817   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
818   PetscFunctionReturn(0);
819 }
820 
821 /* handles sparse matrix matrix ops */
822 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
823 {
824   PetscErrorCode ierr;
825   Mat_Product    *product = mat->product;
826   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
827 
828   PetscFunctionBegin;
829   MatCheckProduct(mat,1);
830   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
831   if (product->type == MATPRODUCT_ABC) {
832     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
833   }
834   if (Biskok && Ciskok) {
835     switch (product->type) {
836     case MATPRODUCT_AB:
837     case MATPRODUCT_AtB:
838     case MATPRODUCT_ABt:
839       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
840       break;
841     case MATPRODUCT_PtAP:
842     case MATPRODUCT_RARt:
843     case MATPRODUCT_ABC:
844       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
845       break;
846     default:
847       break;
848     }
849   } else { /* fallback for AIJ */
850     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
851   }
852   PetscFunctionReturn(0);
853 }
854 
855 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
856 {
857   PetscErrorCode   ierr;
858   Mat_SeqAIJKokkos *aijkok;
859 
860   PetscFunctionBegin;
861   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
862   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
863   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
864   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
865   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
866   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
867   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
868   PetscFunctionReturn(0);
869 }
870 
871 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
872 {
873   PetscErrorCode   ierr;
874   Mat_SeqAIJKokkos *aijkok;
875 
876   PetscFunctionBegin;
877   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
878   if (aijkok) { /* Only zero the device if data is already there */
879     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
880     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
881   } else { /* Might be preallocated but not assembled */
882     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
883   }
884   PetscFunctionReturn(0);
885 }
886 
887 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
888 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
889 {
890   PetscErrorCode     ierr;
891   Mat_SeqAIJKokkos   *aijkok;
892 
893   PetscFunctionBegin;
894   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
895   PetscValidPointer(kv,2);
896   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
897   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
898   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
899   *kv    = aijkok->a_dual.view_device();
900   PetscFunctionReturn(0);
901 }
902 
903 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
904 {
905   PetscFunctionBegin;
906   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
907   PetscValidPointer(kv,2);
908   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
909   PetscFunctionReturn(0);
910 }
911 
912 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
913 {
914   PetscErrorCode     ierr;
915   Mat_SeqAIJKokkos   *aijkok;
916 
917   PetscFunctionBegin;
918   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
919   PetscValidPointer(kv,2);
920   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
921   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
922   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
923   *kv    = aijkok->a_dual.view_device();
924   PetscFunctionReturn(0);
925 }
926 
927 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
928 {
929   PetscErrorCode     ierr;
930 
931   PetscFunctionBegin;
932   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
933   PetscValidPointer(kv,2);
934   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
935   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
936   PetscFunctionReturn(0);
937 }
938 
939 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
940 {
941   Mat_SeqAIJKokkos   *aijkok;
942 
943   PetscFunctionBegin;
944   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
945   PetscValidPointer(kv,2);
946   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
947   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
948   *kv    = aijkok->a_dual.view_device();
949   PetscFunctionReturn(0);
950 }
951 
952 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
953 {
954   PetscErrorCode     ierr;
955 
956   PetscFunctionBegin;
957   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
958   PetscValidPointer(kv,2);
959   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
960   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
961   PetscFunctionReturn(0);
962 }
963 
964 /* Computes Y += alpha X */
965 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
966 {
967   PetscErrorCode             ierr;
968   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
969   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
970   ConstMatScalarKokkosView   Xa;
971   MatScalarKokkosView        Ya;
972 
973   PetscFunctionBegin;
974   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
975   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
976   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
977   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
978   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
979 
980   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
981     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
982     PetscBool e;
983     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
984     if (e) {
985       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
986       if (e) pattern = SAME_NONZERO_PATTERN;
987     }
988   }
989 
990   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
991     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
992     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
993     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
994   */
995   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
996   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
997   Xa   = xkok->a_dual.view_device();
998   Ya   = ykok->a_dual.view_device();
999 
1000   if (pattern == SAME_NONZERO_PATTERN) {
1001     KokkosBlas::axpy(alpha,Xa,Ya);
1002     ierr = MatSeqAIJKokkosModifyDevice(Y);
1003   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1004     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
1005     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
1006 
1007     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
1008       PetscInt i = t.league_rank(); /* row i */
1009       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
1010         PetscInt p,q = Yi(i);
1011         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1012           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1013           if (Xj(p) == Yj(q)) { /* Found it */
1014             Ya(q) += alpha * Xa(p);
1015             q++;
1016           } else {
1017             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1018                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1019             */
1020             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1021           }
1022         }
1023       });
1024     });
1025     ierr = MatSeqAIJKokkosModifyDevice(Y);
1026   } else { /* different nonzero patterns */
1027     Mat             Z;
1028     KokkosCsrMatrix zcsr;
1029     KernelHandle    kh;
1030     kh.create_spadd_handle(false);
1031     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1032     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1033     zkok = new Mat_SeqAIJKokkos(zcsr);
1034     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1035     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1036     kh.destroy_spadd_handle();
1037   }
1038   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1039   ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */
1040   PetscFunctionReturn(0);
1041 }
1042 
1043 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
1044 {
1045   PetscErrorCode            ierr;
1046   Mat                       newmat;
1047   Mat_SeqAIJKokkos          *akok;
1048   Mat_SeqAIJ                *aseq;
1049 
1050   PetscFunctionBegin;
1051   ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr);
1052   ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr);
1053   ierr = MatSetType(newmat,MATSEQAIJ);CHKERRQ(ierr);
1054   ierr = MatSetPreallocationCOO_SeqAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
1055   ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
1056   ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
1057   ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
1058   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
1059   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1060   akok->SetUpCOO(aseq);
1061   PetscFunctionReturn(0);
1062 }
1063 
1064 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1065 {
1066   PetscErrorCode              ierr;
1067   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1068   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1069   PetscCount                  Annz = aseq->nz;
1070   const PetscCountKokkosView& jmap = akok->jmap_d;
1071   const PetscCountKokkosView& perm = akok->perm_d;
1072   MatScalarKokkosView         Aa;
1073   ConstMatScalarKokkosView    kv;
1074   PetscMemType                memtype;
1075 
1076   PetscFunctionBegin;
1077   PetscAssert(A->assembled,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Expected matrix to be already assembled in MatSetPreallocationCOO()");
1078   ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
1079   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1080     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
1081   } else {
1082     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
1083   }
1084 
1085   if (imode == INSERT_VALUES) {ierr = MatSeqAIJGetKokkosViewWrite(A,&Aa);CHKERRQ(ierr);} /* write matrix values */
1086   else {ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr);} /* read & write matrix values */
1087 
1088   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {
1089     PetscScalar sum = 0.0;
1090     for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k));
1091     Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum;
1092   });
1093 
1094   if (imode == INSERT_VALUES) {ierr = MatSeqAIJRestoreKokkosViewWrite(A,&Aa);CHKERRQ(ierr);}
1095   else {ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);}
1096   PetscFunctionReturn(0);
1097 }
1098 
1099 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1100 {
1101   PetscErrorCode ierr;
1102 
1103   PetscFunctionBegin;
1104   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
1105   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
1106   B->offloadmask = PETSC_OFFLOAD_CPU;
1107   PetscFunctionReturn(0);
1108 }
1109 
1110 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1111 {
1112   PetscErrorCode     ierr;
1113   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1114 
1115   PetscFunctionBegin;
1116   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1117   A->boundtocpu  = PETSC_FALSE;
1118 
1119   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1120   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1121   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1122   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1123   A->ops->scale                     = MatScale_SeqAIJKokkos;
1124   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1125   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1126   A->ops->mult                      = MatMult_SeqAIJKokkos;
1127   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1128   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1129   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1130   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1131   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1132   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1133   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1134   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1135   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1136   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1137   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1138   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1139   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1140   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1141 
1142   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr);
1143   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr);
1144   PetscFunctionReturn(0);
1145 }
1146 
1147 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1148 {
1149   PetscErrorCode ierr;
1150   Mat_SeqAIJ     *aseq;
1151   PetscInt       i,m,n;
1152 
1153   PetscFunctionBegin;
1154   PetscCheckFalse(A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1155 
1156   m    = akok->nrows();
1157   n    = akok->ncols();
1158   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1159   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1160 
1161   /* Set up data structures of A as a MATSEQAIJ */
1162   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1163   aseq = (Mat_SeqAIJ*)(A)->data;
1164 
1165   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1166   akok->j_dual.sync_host();
1167 
1168   aseq->i            = akok->i_host_data();
1169   aseq->j            = akok->j_host_data();
1170   aseq->a            = akok->a_host_data();
1171   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1172   aseq->singlemalloc = PETSC_FALSE;
1173   aseq->free_a       = PETSC_FALSE;
1174   aseq->free_ij      = PETSC_FALSE;
1175   aseq->nz           = akok->nnz();
1176   aseq->maxnz        = aseq->nz;
1177 
1178   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1179   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1180   for (i=0; i<m; i++) {
1181     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1182   }
1183 
1184   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1185   akok->nonzerostate = A->nonzerostate;
1186   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1187   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1188   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1189   PetscFunctionReturn(0);
1190 }
1191 
1192 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1193 
1194    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1195  */
1196 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1197 {
1198   PetscErrorCode ierr;
1199 
1200   PetscFunctionBegin;
1201   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1202   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
1203   PetscFunctionReturn(0);
1204 }
1205 
1206 /* --------------------------------------------------------------------------------*/
1207 /*@C
1208    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1209    (the default parallel PETSc format). This matrix will ultimately be handled by
1210    Kokkos for calculations. For good matrix
1211    assembly performance the user should preallocate the matrix storage by setting
1212    the parameter nz (or the array nnz).  By setting these parameters accurately,
1213    performance during matrix assembly can be increased by more than a factor of 50.
1214 
1215    Collective
1216 
1217    Input Parameters:
1218 +  comm - MPI communicator, set to PETSC_COMM_SELF
1219 .  m - number of rows
1220 .  n - number of columns
1221 .  nz - number of nonzeros per row (same for all rows)
1222 -  nnz - array containing the number of nonzeros in the various rows
1223          (possibly different for each row) or NULL
1224 
1225    Output Parameter:
1226 .  A - the matrix
1227 
1228    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1229    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1230    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1231 
1232    Notes:
1233    If nnz is given then nz is ignored
1234 
1235    The AIJ format (also called the Yale sparse matrix format or
1236    compressed row storage), is fully compatible with standard Fortran 77
1237    storage.  That is, the stored row and column indices can begin at
1238    either one (as in Fortran) or zero.  See the users' manual for details.
1239 
1240    Specify the preallocated storage with either nz or nnz (not both).
1241    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1242    allocation.  For large problems you MUST preallocate memory or you
1243    will get TERRIBLE performance, see the users' manual chapter on matrices.
1244 
1245    By default, this format uses inodes (identical nodes) when possible, to
1246    improve numerical efficiency of matrix-vector products and solves. We
1247    search for consecutive rows with the same nonzero structure, thereby
1248    reusing matrix information to achieve increased efficiency.
1249 
1250    Level: intermediate
1251 
1252 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1253 @*/
1254 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1255 {
1256   PetscErrorCode ierr;
1257 
1258   PetscFunctionBegin;
1259   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1260   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1261   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1262   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1263   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1264   PetscFunctionReturn(0);
1265 }
1266 
1267 typedef Kokkos::TeamPolicy<>::member_type team_member;
1268 //
1269 // This factorization exploits block diagonal matrices with "Nf" (not used).
1270 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1271 //
1272 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1273 {
1274   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1275   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1276   IS                 isrow = b->row,isicol = b->icol;
1277   PetscErrorCode     ierr;
1278   const PetscInt     *r_h,*ic_h;
1279   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1280   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1281   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1282   PetscBool          row_identity,col_identity;
1283   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1284 
1285   PetscFunctionBegin;
1286   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1287   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
1288   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1289   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
1290   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
1291   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
1292   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1293   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1294   {
1295 #define KOKKOS_SHARED_LEVEL 1
1296     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1297     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1298     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1299     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1300     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1301     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1302     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1303     size_t flops_h = 0.0;
1304     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1305     Kokkos::View<size_t> d_flops_k ("flops");
1306     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1307     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1308     Kokkos::deep_copy (d_flops_k, h_flops_k);
1309     Kokkos::deep_copy (d_r_k, h_r_k);
1310     Kokkos::deep_copy (d_ic_k, h_ic_k);
1311     // Fill A --> fact
1312     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1313         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1314         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1315         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1316         // zero rows of B
1317         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1318             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1319             PetscScalar *baL = ba_d + bi_d[rowb];
1320             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1321             /* zero (unfactored row) */
1322             for (int j=0;j<nzbL;j++) baL[j] = 0;
1323             for (int j=0;j<nzbU;j++) baU[j] = 0;
1324           });
1325         // copy A into B
1326         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1327             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1328             const PetscScalar *av    = aa_d + ai_d[rowa];
1329             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1330             /* load in initial (unfactored row) */
1331             for (int j=0;j<nza;j++) {
1332               PetscInt    colb = ic[ajtmp[j]];
1333               PetscScalar vala = av[j];
1334               if (colb == rowb) {
1335                 *(ba_d + bdiag_d[rowb]) = vala;
1336               } else {
1337                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1338                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1339                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1340                 for (int j=0; j<nz ; j++) {
1341                   if (pbj[j] == colb) {
1342                     pba[j] = vala;
1343                     set++;
1344                     break;
1345                   }
1346                 }
1347                #if !defined(PETSC_HAVE_SYCL)
1348                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1349                #endif
1350               }
1351             }
1352           });
1353       });
1354     Kokkos::fence();
1355 
1356     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1357         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1358         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1359         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1360         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1361         const PetscInt  start = field*nloc, end = start + nloc;
1362         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1363         // A22 panel update for each row A(1,:) and col A(:,1)
1364         for (int ii=start; ii<end-1; ii++) {
1365           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1366           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1367           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1368           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1369           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1370               PetscInt kIdx = j*Ni + field_block_idx;
1371               if (kIdx >= nzUi) /* void */ ;
1372               else {
1373                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1374                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1375                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1376                 size_t         st_idx;
1377                 // find and do L(k,i) = A(:k,i) / A(i,i)
1378                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1379                 // get column, there has got to be a better way
1380                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1381                     if (pjL[j] == ii) {
1382                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1383                       idx = j; // output
1384                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1385                     }
1386                 }, st_idx);
1387                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1388 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1389                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1390 #endif
1391                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1392                 // U(i+1,:end)
1393                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1394                       PetscScalar Uij = baUi[uiIdx];
1395                       PetscInt    col = bjUi[uiIdx];
1396                       if (col==myk) {
1397                         // A_kk = A_kk - L_ki * U_ij(k)
1398                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1399                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1400                       } else {
1401                         PetscScalar    *start, *end, *pAkjv=NULL;
1402                         PetscInt       high, low;
1403                         const PetscInt *startj;
1404                         if (col<myk) { // L
1405                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1406                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1407                           start = pLki+1; // start at pLki+1, A22(myk,1)
1408                           startj= bj_d + bi_d[myk] + idx;
1409                           end   = ba_d + bi_d[myk+1];
1410                         } else {
1411                           PetscInt idx = bdiag_d[myk+1]+1;
1412                           start = ba_d + idx;
1413                           startj= bj_d + idx;
1414                           end   = ba_d + bdiag_d[myk];
1415                         }
1416                         // search for 'col', use bisection search - TODO
1417                         low  = 0;
1418                         high = (PetscInt)(end-start);
1419                         while (high-low > 5) {
1420                           int t = (low+high)/2;
1421                           if (startj[t] > col) high = t;
1422                           else                 low  = t;
1423                         }
1424                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1425                           if (startj[pAkjv-start] == col) break;
1426                         }
1427 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1428                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1429 #endif
1430                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1431                       }
1432                     });
1433               }
1434             });
1435           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1436           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1437         } /* endof for (i=0; i<n; i++) { */
1438         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1439       });
1440     Kokkos::fence();
1441     Kokkos::deep_copy (h_flops_k, d_flops_k);
1442     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1443     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1444         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1445         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1446         /* Invert diagonal for simpler triangular solves */
1447         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1448             int i = start + outer_index*Ni + lg_rank%Ni;
1449             if (i < end) {
1450               PetscScalar *pv = ba_d + bdiag_d[i];
1451               *pv = 1.0/(*pv);
1452             }
1453           });
1454       });
1455   }
1456   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1457   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
1458   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
1459 
1460   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1461   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
1462   if (b->inode.size) {
1463     B->ops->solve = MatSolve_SeqAIJ_Inode;
1464   } else if (row_identity && col_identity) {
1465     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1466   } else {
1467     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1468   }
1469   B->offloadmask = PETSC_OFFLOAD_GPU;
1470   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
1471   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1472   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1473   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1474   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1475   B->assembled              = PETSC_TRUE;
1476   B->preallocated           = PETSC_TRUE;
1477 
1478   PetscFunctionReturn(0);
1479 }
1480 
1481 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1482 {
1483   PetscErrorCode   ierr;
1484 
1485   PetscFunctionBegin;
1486   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1487   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1488   PetscFunctionReturn(0);
1489 }
1490 
1491 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1492 {
1493   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1494 
1495   PetscFunctionBegin;
1496   if (!factors->sptrsv_symbolic_completed) {
1497     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1498     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1499     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1500   }
1501   PetscFunctionReturn(0);
1502 }
1503 
1504 /* Check if we need to update factors etc for transpose solve */
1505 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1506 {
1507   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1508   MatColIdxType             n = A->rmap->n;
1509 
1510   PetscFunctionBegin;
1511   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1512     /* Update L^T and do sptrsv symbolic */
1513     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1514     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1515     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1516     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1517 
1518     KokkosKernels::Impl::transpose_matrix<
1519       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1520       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1521       MatRowMapKokkosView,DefaultExecutionSpace>(
1522         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1523         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1524 
1525     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1526       We have to sort the indices, until KK provides finer control options.
1527     */
1528     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1529       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1530         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1531 
1532     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1533 
1534     /* Update U^T and do sptrsv symbolic */
1535     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1536     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1537     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1538     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1539 
1540     KokkosKernels::Impl::transpose_matrix<
1541       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1542       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1543       MatRowMapKokkosView,DefaultExecutionSpace>(
1544         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1545         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1546 
1547     /* Sort indices. See comments above */
1548     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1549       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1550         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1551 
1552     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1553     factors->transpose_updated = PETSC_TRUE;
1554   }
1555   PetscFunctionReturn(0);
1556 }
1557 
1558 /* Solve Ax = b, with A = LU */
1559 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1560 {
1561   PetscErrorCode                 ierr;
1562   ConstPetscScalarKokkosView     bv;
1563   PetscScalarKokkosView          xv;
1564   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1565 
1566   PetscFunctionBegin;
1567   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1568   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
1569   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1570   ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr);
1571   /* Solve L tmpv = b */
1572   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1573   /* Solve Ux = tmpv */
1574   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1575   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1576   ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr);
1577   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1578   PetscFunctionReturn(0);
1579 }
1580 
1581 /* Solve A^T x = b, where A^T = U^T L^T */
1582 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1583 {
1584   PetscErrorCode                 ierr;
1585   ConstPetscScalarKokkosView     bv;
1586   PetscScalarKokkosView          xv;
1587   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1588 
1589   PetscFunctionBegin;
1590   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1591   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
1592   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1593   ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr);
1594   /* Solve U^T tmpv = b */
1595   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1596 
1597   /* Solve L^T x = tmpv */
1598   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1599   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1600   ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr);
1601   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1602   PetscFunctionReturn(0);
1603 }
1604 
1605 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1606 {
1607   PetscErrorCode                 ierr;
1608   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1609   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1610   PetscInt                       fill_lev = info->levels;
1611 
1612   PetscFunctionBegin;
1613   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1614   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1615 
1616   auto a_d = aijkok->a_dual.view_device();
1617   auto i_d = aijkok->i_dual.view_device();
1618   auto j_d = aijkok->j_dual.view_device();
1619 
1620   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1621 
1622   B->assembled                       = PETSC_TRUE;
1623   B->preallocated                    = PETSC_TRUE;
1624   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1625   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1626   B->ops->matsolve                   = NULL;
1627   B->ops->matsolvetranspose          = NULL;
1628   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1629 
1630   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1631   factors->transpose_updated         = PETSC_FALSE;
1632   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1633   /* TODO: log flops, but how to know that? */
1634   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1635   PetscFunctionReturn(0);
1636 }
1637 
1638 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1639 {
1640   PetscErrorCode                 ierr;
1641   Mat_SeqAIJKokkos               *aijkok;
1642   Mat_SeqAIJ                     *b;
1643   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1644   PetscInt                       fill_lev = info->levels;
1645   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1646   PetscInt                       n = A->rmap->n;
1647 
1648   PetscFunctionBegin;
1649   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1650   /* Rebuild factors */
1651   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1652   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1653 
1654   /* Create a spiluk handle and then do symbolic factorization */
1655   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1656   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1657 
1658   auto spiluk_handle = factors->kh.get_spiluk_handle();
1659 
1660   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1661   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1662   Kokkos::realloc(factors->iU_d,n+1);
1663   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1664 
1665   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1666   auto i_d = aijkok->i_dual.view_device();
1667   auto j_d = aijkok->j_dual.view_device();
1668   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1669   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1670 
1671   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1672   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1673   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1674   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1675 
1676   /* TODO: add options to select sptrsv algorithms */
1677   /* Create sptrsv handles for L, U and their transpose */
1678  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1679   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1680  #else
1681   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1682  #endif
1683 
1684   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1685   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1686   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1687   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1688 
1689   /* Fill fields of the factor matrix B */
1690   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1691   b     = (Mat_SeqAIJ*)B->data;
1692   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1693   B->info.fill_ratio_given  = info->fill;
1694   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1695 
1696   B->offloadmask            = PETSC_OFFLOAD_GPU;
1697   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1698   PetscFunctionReturn(0);
1699 }
1700 
1701 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1702 {
1703   PetscErrorCode   ierr;
1704   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1705   const PetscInt   nrows   = A->rmap->n;
1706 
1707   PetscFunctionBegin;
1708   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1709   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1710   // move B data into Kokkos
1711   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
1712   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
1713   {
1714     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1715     if (!baijkok->diag_d.extent(0)) {
1716       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1717       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1718       Kokkos::deep_copy (baijkok->diag_d, h_diag);
1719     }
1720   }
1721   PetscFunctionReturn(0);
1722 }
1723 
1724 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1725 {
1726   PetscFunctionBegin;
1727   *type = MATSOLVERKOKKOS;
1728   PetscFunctionReturn(0);
1729 }
1730 
1731 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1732 {
1733   PetscFunctionBegin;
1734   *type = MATSOLVERKOKKOSDEVICE;
1735   PetscFunctionReturn(0);
1736 }
1737 
1738 /*MC
1739   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1740   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1741 
1742   Level: beginner
1743 
1744 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1745 M*/
1746 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1747 {
1748   PetscErrorCode ierr;
1749   PetscInt       n = A->rmap->n;
1750 
1751   PetscFunctionBegin;
1752   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1753   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1754   (*B)->factortype = ftype;
1755   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1756   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1757 
1758   if (ftype == MAT_FACTOR_LU) {
1759     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1760     (*B)->canuseordering         = PETSC_TRUE;
1761     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1762   } else if (ftype == MAT_FACTOR_ILU) {
1763     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1764     (*B)->canuseordering         = PETSC_FALSE;
1765     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1766   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1767 
1768   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1769   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1770   PetscFunctionReturn(0);
1771 }
1772 
1773 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1774 {
1775   PetscErrorCode ierr;
1776   PetscInt       n = A->rmap->n;
1777 
1778   PetscFunctionBegin;
1779   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1780   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1781   (*B)->factortype = ftype;
1782   (*B)->canuseordering = PETSC_TRUE;
1783   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1784   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1785 
1786   if (ftype == MAT_FACTOR_LU) {
1787     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1788     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1789   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1790 
1791   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1792   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1793   PetscFunctionReturn(0);
1794 }
1795 
1796 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1797 {
1798   PetscErrorCode ierr;
1799 
1800   PetscFunctionBegin;
1801   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1802   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1803   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1804   PetscFunctionReturn(0);
1805 }
1806 
1807 /* Utility to print out a KokkosCsrMatrix for debugging */
1808 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1809 {
1810   PetscErrorCode    ierr;
1811   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1812   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1813   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1814   const PetscInt    *i = iv.data();
1815   const PetscInt    *j = jv.data();
1816   const PetscScalar *a = av.data();
1817   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1818 
1819   PetscFunctionBegin;
1820   ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1821   for (PetscInt k=0; k<m; k++) {
1822     ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr);
1823     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1824       ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1825     }
1826     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1827   }
1828   PetscFunctionReturn(0);
1829 }
1830