xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision db6f9c32f6b4da969b6725201769cd57afa2c7b5)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosKernels_Sorting.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 
18 #include <../src/mat/impls/aij/seq/aij.h>
19 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
20 
21 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
22 
23 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
24    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
25    In the latter case, it is important to set a_dual's sync state correctly.
26  */
27 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
28 {
29   PetscErrorCode    ierr;
30   Mat_SeqAIJ        *aijseq;
31   Mat_SeqAIJKokkos  *aijkok;
32 
33   PetscFunctionBegin;
34   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
35   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
36 
37   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
38   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
39 
40   /* If aijkok does not exist, we just copy i, j to device.
41      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
42      In both cases, we build a new aijkok structure.
43   */
44   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
45     delete aijkok;
46     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
47     A->spptr = aijkok;
48   }
49 
50   if (aijkok && aijkok->device_mat_d.data()) {
51     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
52   }
53   PetscFunctionReturn(0);
54 }
55 
56 /* Sync CSR data to device if not yet */
57 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
58 {
59   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
60 
61   PetscFunctionBegin;
62   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
63   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
64   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
65   if (aijkok->a_dual.need_sync_device()) {
66     aijkok->a_dual.sync_device();
67     aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */
68     aijkok->hermitian_updated = PETSC_FALSE;
69   }
70   PetscFunctionReturn(0);
71 }
72 
73 /* Mark the CSR data on device as modified */
74 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
75 {
76   PetscErrorCode   ierr;
77   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
78 
79   PetscFunctionBegin;
80   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
81   aijkok->a_dual.clear_sync_state();
82   aijkok->a_dual.modify_device();
83   aijkok->transpose_updated = PETSC_FALSE;
84   aijkok->hermitian_updated = PETSC_FALSE;
85   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
86   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
87   PetscFunctionReturn(0);
88 }
89 
90 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
91 {
92   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
93 
94   PetscFunctionBegin;
95   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
96    /* We do not expect one needs factors on host  */
97   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
98   if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
99   aijkok->a_dual.sync_host();
100   PetscFunctionReturn(0);
101 }
102 
103 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
104 {
105   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
106 
107   PetscFunctionBegin;
108   if (aijkok) {
109     aijkok->a_dual.sync_host();
110     *array = aijkok->a_dual.view_host().data();
111   } else { /* Happens when calling MatSetValues on a newly created matrix */
112     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
113   }
114   PetscFunctionReturn(0);
115 }
116 
117 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
118 {
119   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
120 
121   PetscFunctionBegin;
122   if (aijkok) aijkok->a_dual.modify_host();
123   PetscFunctionReturn(0);
124 }
125 
126 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
127 {
128   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
129 
130   PetscFunctionBegin;
131   if (aijkok) {
132     aijkok->a_dual.sync_host();
133     *array = aijkok->a_dual.view_host().data();
134   } else {
135     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
136   }
137   PetscFunctionReturn(0);
138 }
139 
140 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
141 {
142   PetscFunctionBegin;
143   *array = NULL;
144   PetscFunctionReturn(0);
145 }
146 
147 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
148 {
149   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
150 
151   PetscFunctionBegin;
152   if (aijkok) {
153     *array = aijkok->a_dual.view_host().data();
154   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
155     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
156   }
157   PetscFunctionReturn(0);
158 }
159 
160 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
161 {
162   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
163 
164   PetscFunctionBegin;
165   if (aijkok) {
166     aijkok->a_dual.clear_sync_state();
167     aijkok->a_dual.modify_host();
168   }
169   PetscFunctionReturn(0);
170 }
171 
172 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
173 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
174 {
175   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
176   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
177 
178   PetscFunctionBegin;
179   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
180   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
181   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
182   PetscFunctionReturn(0);
183 }
184 
185 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
186 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
187 {
188   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
189 
190   PetscFunctionBegin;
191   if (aijkok && aijkok->device_mat_d.data()) {
192     *d_mat = aijkok->device_mat_d.data();
193   } else {
194     PetscErrorCode   ierr;
195     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
196     *d_mat  = NULL;
197   }
198   PetscFunctionReturn(0);
199 }
200 
201 /* Generate the transpose on device and cache it internally */
202 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
203 {
204   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
205 
206   PetscFunctionBegin;
207   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
208   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
209     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
210     CHKERRCXX(aijkok->a_dual.sync_device());
211     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
212     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
213     aijkok->transpose_updated = PETSC_TRUE;
214   }
215   *csrmatT = &aijkok->csrmatT;
216   PetscFunctionReturn(0);
217 }
218 
219 /* Generate the Hermitian on device and cache it internally */
220 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
221 {
222   PetscErrorCode                   ierr;
223   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
224 
225   PetscFunctionBegin;
226   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
227   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
228   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
229     CHKERRCXX(aijkok->a_dual.sync_device());
230     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
231     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
232    #if defined(PETSC_USE_COMPLEX)
233     const auto& a = aijkok->csrmatH.values;
234     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
235    #endif
236     aijkok->hermitian_updated = PETSC_TRUE;
237   }
238   *csrmatH = &aijkok->csrmatH;
239   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
240   PetscFunctionReturn(0);
241 }
242 
243 /* y = A x */
244 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
245 {
246   PetscErrorCode                   ierr;
247   Mat_SeqAIJKokkos                 *aijkok;
248   ConstPetscScalarKokkosView       xv;
249   PetscScalarKokkosView            yv;
250 
251   PetscFunctionBegin;
252   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
253   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
254   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
255   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
256   ierr   = PetscLogGpuTimeBegin();CHKERRQ(ierr);
257   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
258   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
259   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
260   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
261   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
262   PetscFunctionReturn(0);
263 }
264 
265 /* y = A^T x */
266 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
267 {
268   PetscErrorCode                   ierr;
269   Mat_SeqAIJKokkos                 *aijkok;
270   const char                       *mode;
271   ConstPetscScalarKokkosView       xv;
272   PetscScalarKokkosView            yv;
273   KokkosCsrMatrix                  *csrmat;
274 
275   PetscFunctionBegin;
276   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
277   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
278   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
279   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
280   if (A->form_explicit_transpose) {
281     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
282     mode = "N";
283   } else {
284     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
285     csrmat = &aijkok->csrmat;
286     mode = "T";
287   }
288   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
289   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
290   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
291   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
292   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
293   PetscFunctionReturn(0);
294 }
295 
296 /* y = A^H x */
297 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
298 {
299   PetscErrorCode                   ierr;
300   Mat_SeqAIJKokkos                 *aijkok;
301   const char                       *mode;
302   ConstPetscScalarKokkosView       xv;
303   PetscScalarKokkosView            yv;
304   KokkosCsrMatrix                  *csrmat;
305 
306   PetscFunctionBegin;
307   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
308   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
309   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
310   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
311   if (A->form_explicit_transpose) {
312     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
313     mode = "N";
314   } else {
315     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
316     csrmat = &aijkok->csrmat;
317     mode = "C";
318   }
319   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
320   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
321   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
322   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
323   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
324   PetscFunctionReturn(0);
325 }
326 
327 /* z = A x + y */
328 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
329 {
330   PetscErrorCode                   ierr;
331   Mat_SeqAIJKokkos                 *aijkok;
332   ConstPetscScalarKokkosView       xv,yv;
333   PetscScalarKokkosView            zv;
334 
335   PetscFunctionBegin;
336   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
337   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
338   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
339   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
340   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
341   if (zz != yy) Kokkos::deep_copy(zv,yv);
342   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
343   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
344   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
345   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
346   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
347   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
348   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
349   PetscFunctionReturn(0);
350 }
351 
352 /* z = A^T x + y */
353 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
354 {
355   PetscErrorCode                   ierr;
356   Mat_SeqAIJKokkos                 *aijkok;
357   const char                       *mode;
358   ConstPetscScalarKokkosView       xv,yv;
359   PetscScalarKokkosView            zv;
360   KokkosCsrMatrix                  *csrmat;
361 
362   PetscFunctionBegin;
363   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
364   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
365   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
366   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
367   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
368   if (zz != yy) Kokkos::deep_copy(zv,yv);
369   if (A->form_explicit_transpose) {
370     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
371     mode = "N";
372   } else {
373     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
374     csrmat = &aijkok->csrmat;
375     mode = "T";
376   }
377   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
378   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
379   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
380   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
381   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
382   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
383   PetscFunctionReturn(0);
384 }
385 
386 /* z = A^H x + y */
387 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
388 {
389   PetscErrorCode                   ierr;
390   Mat_SeqAIJKokkos                 *aijkok;
391   const char                       *mode;
392   ConstPetscScalarKokkosView       xv,yv;
393   PetscScalarKokkosView            zv;
394   KokkosCsrMatrix                  *csrmat;
395 
396   PetscFunctionBegin;
397   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
398   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
399   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
400   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
401   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
402   if (zz != yy) Kokkos::deep_copy(zv,yv);
403   if (A->form_explicit_transpose) {
404     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
405     mode = "N";
406   } else {
407     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
408     csrmat = &aijkok->csrmat;
409     mode = "C";
410   }
411   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
412   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
413   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
414   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
415   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
416   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
417   PetscFunctionReturn(0);
418 }
419 
420 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
421 {
422   PetscErrorCode            ierr;
423   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
424 
425   PetscFunctionBegin;
426   switch (op) {
427     case MAT_FORM_EXPLICIT_TRANSPOSE:
428       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
429       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
430       A->form_explicit_transpose = flg;
431       break;
432     default:
433       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
434       break;
435   }
436   PetscFunctionReturn(0);
437 }
438 
439 /* Depending on reuse, either build a new mat, or use the existing mat */
440 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
441 {
442   PetscErrorCode   ierr;
443   Mat_SeqAIJ       *aseq;
444 
445   PetscFunctionBegin;
446   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
447   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
448     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
449   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
450     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
451   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
452     if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
453     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
454     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
455     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
456     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
457     aseq = static_cast<Mat_SeqAIJ*>(A->data);
458     if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */
459       if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
460       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
461     }
462   }
463   PetscFunctionReturn(0);
464 }
465 
466 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
467    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
468  */
469 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
470 {
471   PetscErrorCode        ierr;
472   Mat_SeqAIJ            *bseq;
473   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
474   Mat                   mat;
475 
476   PetscFunctionBegin;
477   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
478   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
479   mat  = *B;
480   if (A->assembled) {
481     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
482     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
483     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
484     /* Now copy values to B if needed */
485     if (dupOption == MAT_COPY_VALUES) {
486       if (akok->a_dual.need_sync_device()) {
487         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
488         bkok->a_dual.modify_host();
489       } else { /* If device has the latest data, we only copy data on device */
490         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
491         bkok->a_dual.modify_device();
492       }
493     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
494       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
495       bkok->a_dual.modify_host();
496     }
497     mat->spptr = bkok;
498   }
499 
500   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
501   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
502   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
503   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
504   PetscFunctionReturn(0);
505 }
506 
507 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
508 {
509   PetscErrorCode    ierr;
510   Mat               At;
511   KokkosCsrMatrix   *internT,*csrmatT;
512   Mat_SeqAIJKokkos  *atkok,*bkok;
513 
514   PetscFunctionBegin;
515   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
516   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
517     CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */
518     CHKERRCXX(atkok   = new Mat_SeqAIJKokkos(*csrmatT));
519     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
520     if (reuse == MAT_INITIAL_MATRIX) *B = At;
521     else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
522   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
523     if ((*B)->assembled) {
524       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
525       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
526       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
527     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
528       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
529       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
530       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
531       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
532       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
533     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
534   }
535   PetscFunctionReturn(0);
536 }
537 
538 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
539 {
540   PetscErrorCode             ierr;
541   Mat_SeqAIJKokkos           *aijkok;
542 
543   PetscFunctionBegin;
544   if (A->factortype == MAT_FACTOR_NONE) {
545     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
546     if (aijkok) {
547       if (aijkok->device_mat_d.data()) {
548         delete aijkok->colmap_d;
549         delete aijkok->i_uncompressed_d;
550       }
551       if (aijkok->diag_d) delete aijkok->diag_d;
552     }
553     delete aijkok;
554   } else {
555     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
556   }
557   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
558   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
559   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       NULL);CHKERRQ(ierr);
560   A->spptr = NULL;
561   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
562   PetscFunctionReturn(0);
563 }
564 
565 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
566 {
567   PetscErrorCode ierr;
568 
569   PetscFunctionBegin;
570   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
571   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
572   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
573   PetscFunctionReturn(0);
574 }
575 
576 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
577 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
578 {
579   PetscErrorCode               ierr;
580   Mat_SeqAIJ                   *a,*b;
581   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
582   MatScalarKokkosView          aa,ba,ca;
583   MatRowMapKokkosView          ai,bi,ci;
584   MatColIdxKokkosView          aj,bj,cj;
585   PetscInt                     m,n,nnz,aN;
586 
587   PetscFunctionBegin;
588   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
589   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
590   PetscValidPointer(C,4);
591   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
592   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
593   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
594   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
595 
596   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
597   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
598   a    = static_cast<Mat_SeqAIJ*>(A->data);
599   b    = static_cast<Mat_SeqAIJ*>(B->data);
600   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
601   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
602   aa   = akok->a_dual.view_device();
603   ai   = akok->i_dual.view_device();
604   ba   = bkok->a_dual.view_device();
605   bi   = bkok->i_dual.view_device();
606   m    = A->rmap->n; /* M, N and nnz of C */
607   n    = A->cmap->n + B->cmap->n;
608   nnz  = a->nz + b->nz;
609   aN   = A->cmap->n; /* N of A */
610   if (reuse == MAT_INITIAL_MATRIX) {
611     aj = akok->j_dual.view_device();
612     bj = bkok->j_dual.view_device();
613     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
614     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
615     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
616     ca = ca_dual.view_device();
617     ci = ci_dual.view_device();
618     cj = cj_dual.view_device();
619 
620     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
621     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
622       PetscInt i = t.league_rank(); /* row i */
623       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
624 
625       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
626         ci(i) = coffset;
627         if (i == m-1) ci(m) = ai(m) + bi(m);
628       });
629 
630       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
631         if (k < alen) {
632           ca(coffset+k) = aa(ai(i)+k);
633           cj(coffset+k) = aj(ai(i)+k);
634         } else {
635           ca(coffset+k) = ba(bi(i)+k-alen);
636           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
637         }
638       });
639     });
640     ca_dual.modify_device();
641     ci_dual.modify_device();
642     cj_dual.modify_device();
643     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
644     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
645   } else if (reuse == MAT_REUSE_MATRIX) {
646     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
647     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
648     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
649     ca   = ckok->a_dual.view_device();
650     ci   = ckok->i_dual.view_device();
651 
652     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
653       PetscInt i = t.league_rank(); /* row i */
654       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
655       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
656         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
657         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
658       });
659     });
660     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
661   }
662   PetscFunctionReturn(0);
663 }
664 
665 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
666 {
667   PetscFunctionBegin;
668   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
669   PetscFunctionReturn(0);
670 }
671 
672 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
673 {
674   PetscErrorCode                 ierr;
675   Mat_Product                    *product = C->product;
676   Mat                            A,B;
677   bool                           transA,transB; /* use bool, since KK needs this type */
678   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
679   Mat_SeqAIJ                     *c;
680   MatProductData_SeqAIJKokkos    *pdata;
681   KokkosCsrMatrix                *csrmatA,*csrmatB;
682 
683   PetscFunctionBegin;
684   MatCheckProduct(C,1);
685   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
686   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
687 
688   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
689     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
690     PetscFunctionReturn(0);
691   }
692 
693   switch (product->type) {
694     case MATPRODUCT_AB:  transA = false; transB = false; break;
695     case MATPRODUCT_AtB: transA = true;  transB = false; break;
696     case MATPRODUCT_ABt: transA = false; transB = true;  break;
697     default:
698       SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
699   }
700 
701   A     = product->A;
702   B     = product->B;
703   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
704   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
705   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
706   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
707   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
708 
709   if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
710 
711   csrmatA = &akok->csrmat;
712   csrmatB = &bkok->csrmat;
713 
714   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
715   if (transA) {
716     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
717     transA = false;
718   }
719 
720   if (transB) {
721     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
722     transB = false;
723   }
724   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
725   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
726   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
727   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
728   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
729   /* shorter version of MatAssemblyEnd_SeqAIJ */
730   c = (Mat_SeqAIJ*)C->data;
731   ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
732   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
733   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
734   c->reallocs         = 0;
735   C->info.mallocs     = 0;
736   C->info.nz_unneeded = 0;
737   C->assembled        = C->was_assembled = PETSC_TRUE;
738   C->num_ass++;
739   PetscFunctionReturn(0);
740 }
741 
742 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
743 {
744   PetscErrorCode                 ierr;
745   Mat_Product                    *product = C->product;
746   MatProductType                 ptype;
747   Mat                            A,B;
748   bool                           transA,transB;
749   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
750   MatProductData_SeqAIJKokkos    *pdata;
751   MPI_Comm                       comm;
752   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
753 
754   PetscFunctionBegin;
755   MatCheckProduct(C,1);
756   ierr = PetscObjectGetComm((PetscObject)C,&comm);
757   if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty");
758   A       = product->A;
759   B       = product->B;
760   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
761   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
762   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
763   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
764   csrmatA = &akok->csrmat;
765   csrmatB = &bkok->csrmat;
766 
767   ptype   = product->type;
768   switch (ptype) {
769     case MATPRODUCT_AB:  transA = false; transB = false; break;
770     case MATPRODUCT_AtB: transA = true;  transB = false; break;
771     case MATPRODUCT_ABt: transA = false; transB = true;  break;
772     default:
773       SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
774   }
775 
776   product->data = pdata = new MatProductData_SeqAIJKokkos();
777   pdata->kh.set_team_work_size(16);
778   pdata->kh.set_dynamic_scheduling(true);
779   pdata->reusesym = product->api_user;
780 
781   /* TODO: add command line options to select spgemm algorithms */
782   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
783   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
784      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
785    */
786   pdata->kh.create_spgemm_handle(spgemm_alg);
787 
788   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
789   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
790   if (transA) {
791     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
792     transA = false;
793   }
794 
795   if (transB) {
796     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
797     transB = false;
798   }
799 
800   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
801   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
802     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
803     calling new Mat_SeqAIJKokkos().
804     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
805   */
806   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
807   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
808   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
809 
810   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
811   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
812   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
813   PetscFunctionReturn(0);
814 }
815 
816 /* handles sparse matrix matrix ops */
817 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
818 {
819   PetscErrorCode ierr;
820   Mat_Product    *product = mat->product;
821   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
822 
823   PetscFunctionBegin;
824   MatCheckProduct(mat,1);
825   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
826   if (product->type == MATPRODUCT_ABC) {
827     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
828   }
829   if (Biskok && Ciskok) {
830     switch (product->type) {
831     case MATPRODUCT_AB:
832     case MATPRODUCT_AtB:
833     case MATPRODUCT_ABt:
834       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
835       break;
836     case MATPRODUCT_PtAP:
837     case MATPRODUCT_RARt:
838     case MATPRODUCT_ABC:
839       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
840       break;
841     default:
842       break;
843     }
844   } else { /* fallback for AIJ */
845     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
846   }
847   PetscFunctionReturn(0);
848 }
849 
850 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
851 {
852   PetscErrorCode   ierr;
853   Mat_SeqAIJKokkos *aijkok;
854 
855   PetscFunctionBegin;
856   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
857   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
858   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
859   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
860   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
861   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
862   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
863   PetscFunctionReturn(0);
864 }
865 
866 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
867 {
868   PetscErrorCode   ierr;
869   Mat_SeqAIJKokkos *aijkok;
870 
871   PetscFunctionBegin;
872   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
873   if (aijkok) { /* Only zero the device if data is already there */
874     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
875     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
876   } else { /* Might be preallocated but not assembled */
877     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
878   }
879   PetscFunctionReturn(0);
880 }
881 
882 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
883 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
884 {
885   PetscErrorCode     ierr;
886   Mat_SeqAIJKokkos   *aijkok;
887 
888   PetscFunctionBegin;
889   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
890   PetscValidPointer(kv,2);
891   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
892   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
893   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
894   *kv    = aijkok->a_dual.view_device();
895   PetscFunctionReturn(0);
896 }
897 
898 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
899 {
900   PetscFunctionBegin;
901   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
902   PetscValidPointer(kv,2);
903   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
904   PetscFunctionReturn(0);
905 }
906 
907 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
908 {
909   PetscErrorCode     ierr;
910   Mat_SeqAIJKokkos   *aijkok;
911 
912   PetscFunctionBegin;
913   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
914   PetscValidPointer(kv,2);
915   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
916   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
917   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
918   *kv    = aijkok->a_dual.view_device();
919   PetscFunctionReturn(0);
920 }
921 
922 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
923 {
924   PetscErrorCode     ierr;
925 
926   PetscFunctionBegin;
927   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
928   PetscValidPointer(kv,2);
929   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
930   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
931   PetscFunctionReturn(0);
932 }
933 
934 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
935 {
936   Mat_SeqAIJKokkos   *aijkok;
937 
938   PetscFunctionBegin;
939   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
940   PetscValidPointer(kv,2);
941   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
942   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
943   *kv    = aijkok->a_dual.view_device();
944   PetscFunctionReturn(0);
945 }
946 
947 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
948 {
949   PetscErrorCode     ierr;
950 
951   PetscFunctionBegin;
952   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
953   PetscValidPointer(kv,2);
954   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
955   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
956   PetscFunctionReturn(0);
957 }
958 
959 /* Computes Y += alpha X */
960 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
961 {
962   PetscErrorCode             ierr;
963   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
964   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
965   ConstMatScalarKokkosView   Xa;
966   MatScalarKokkosView        Ya;
967 
968   PetscFunctionBegin;
969   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
970   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
971   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
972   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
973   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
974 
975   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
976     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
977     PetscBool e;
978     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
979     if (e) {
980       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
981       if (e) pattern = SAME_NONZERO_PATTERN;
982     }
983   }
984 
985   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
986     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
987     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
988     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
989   */
990   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
991   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
992   Xa   = xkok->a_dual.view_device();
993   Ya   = ykok->a_dual.view_device();
994 
995   if (pattern == SAME_NONZERO_PATTERN) {
996     KokkosBlas::axpy(alpha,Xa,Ya);
997     ierr = MatSeqAIJKokkosModifyDevice(Y);
998   } else if (pattern == SUBSET_NONZERO_PATTERN) {
999     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
1000     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
1001 
1002     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
1003       PetscInt i = t.league_rank(); /* row i */
1004       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
1005         PetscInt p,q = Yi(i);
1006         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1007           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1008           if (Xj(p) == Yj(q)) { /* Found it */
1009             Ya(q) += alpha * Xa(p);
1010             q++;
1011           } else {
1012             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1013                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1014             */
1015             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1016           }
1017         }
1018       });
1019     });
1020     ierr = MatSeqAIJKokkosModifyDevice(Y);
1021   } else { /* different nonzero patterns */
1022     Mat             Z;
1023     KokkosCsrMatrix zcsr;
1024     KernelHandle    kh;
1025     kh.create_spadd_handle(false);
1026     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1027     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1028     zkok = new Mat_SeqAIJKokkos(zcsr);
1029     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1030     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1031     kh.destroy_spadd_handle();
1032   }
1033   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1034   ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */
1035   PetscFunctionReturn(0);
1036 }
1037 
1038 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
1039 {
1040   PetscErrorCode            ierr;
1041   MPI_Comm                  comm;
1042   PetscInt                  *i,*j,*perm,*jmap;
1043   PetscInt                  k,M,N,p,q,row,start,end,nnz,nneg;
1044   PetscBool                 has_repeats = PETSC_FALSE;
1045   PetscInt                  *Ai,*Aj;
1046   PetscScalar               *Aa;
1047   Mat_SeqAIJKokkos          *akok;
1048   Mat_SeqAIJ                *aseq;
1049   MatRowMapKokkosViewHost   perm_h("perm",n),jmap_h;
1050   Mat                       newmat;
1051 
1052   PetscFunctionBegin;
1053   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
1054   ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr);
1055   ierr = PetscMalloc2(n,&i,n,&j);CHKERRQ(ierr);
1056   ierr = PetscArraycpy(i,coo_i,n);CHKERRQ(ierr); /* Make a copy since we'll modify it */
1057   ierr = PetscArraycpy(j,coo_j,n);CHKERRQ(ierr);
1058   perm = perm_h.data();
1059   for (k=0; k<n; k++) { /* Ignore entries with negative row or col indices */
1060     if (j[k] < 0) i[k] = -1;
1061     perm[k] = k;
1062   }
1063 
1064   /* Sort by row */
1065   ierr = PetscSortIntWithArrayPair(n,i,j,perm);CHKERRQ(ierr);
1066   for (k=0; k<n; k++) {if (i[k] >= 0) break;} /* Advance k to the first row with a non-negative index */
1067   nneg = k;
1068   Kokkos::resize(jmap_h,n-nneg+1); /* Allocate an extra to make a CSR-like data structure. jmap[i] is the number of repeats for i-th nonzero */
1069   nnz  = 0; /* Total number of unique nonzeros to be counted */
1070   jmap = jmap_h.data(); /* In the end, only the first nnz+1 elements in jmap[] are significant. */
1071   jmap++; /* Inc jmap by 1 for convinience */
1072 
1073   ierr = PetscCalloc1(M+1,&Ai);CHKERRQ(ierr); /* CSR of A */
1074   ierr = PetscMalloc1(n-nneg,&Aj);CHKERRQ(ierr); /* We have at most n-k unique nonzeros */
1075 
1076   /* In each row, sort by column, then unique column indices to get row length */
1077   Ai++; /* Inc by 1 for convinience */
1078   q = 0; /* q-th unique nonzero, with q starting from 0 */
1079   while (k<n) {
1080     row   = i[k];
1081     start = k; /* [start,end) indices for this row */
1082     while (k<n && i[k] == row) k++;
1083     end   = k;
1084     ierr  = PetscSortIntWithArray(end-start,j+start,perm+start);CHKERRQ(ierr);
1085     /* Find number of unique col entries in this row */
1086     Aj[q]   = j[start]; /* Log the first nonzero in this row */
1087     jmap[q] = 1; /* Number of repeats of this nozero entry */
1088     Ai[row] = 1;
1089     nnz++;
1090 
1091     for (p=start+1; p<end; p++) { /* Scan remaining nonzero in this row */
1092       if (j[p] != j[p-1]) { /* Meet a new nonzero */
1093         q++;
1094         jmap[q] = 1;
1095         Aj[q]   = j[p];
1096         Ai[row]++;
1097         nnz++;
1098       } else {
1099         jmap[q]++;
1100         has_repeats = PETSC_TRUE;
1101       }
1102     }
1103     q++; /* Move to next row and thus next unique nonzero */
1104   }
1105   ierr = PetscFree2(i,j);CHKERRQ(ierr);
1106 
1107   Ai--; /* Back to the beginning of Ai[] */
1108   for (k=0; k<M; k++) Ai[k+1] += Ai[k];
1109   jmap--; /* Back to the beginning of jmap[] */
1110   jmap[0] = 0;
1111   if (has_repeats) { /* Only transform jmap[] to CSR when having repeats, otherwise jmap[] is not used */
1112     for (k=0; k<nnz; k++) jmap[k+1] += jmap[k];
1113     Kokkos::resize(jmap_h,nnz+1); /* Resize wrt the actual number of nonzeros */
1114   }
1115 
1116   if (nnz < n-nneg) { /* Realloc Aj[] to actual number of nonzeros */
1117     PetscInt *Aj_new;
1118     ierr = PetscMalloc1(nnz,&Aj_new);CHKERRQ(ierr);
1119     ierr = PetscArraycpy(Aj_new,Aj,nnz);CHKERRQ(ierr);
1120     ierr = PetscFree(Aj);CHKERRQ(ierr);
1121     Aj = Aj_new;
1122   }
1123 
1124   ierr = PetscMalloc1(nnz,&Aa);CHKERRQ(ierr);
1125   ierr = MatCreateSeqAIJWithArrays(comm,M,N,Ai,Aj,Aa,&newmat);CHKERRQ(ierr);
1126   aseq = static_cast<Mat_SeqAIJ*>(newmat->data);
1127   aseq->singlemalloc = PETSC_FALSE; /* Let newmat own Ai,Aj,Aa */
1128   aseq->free_a       = aseq->free_ij = PETSC_TRUE;
1129 
1130   ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
1131   ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
1132   ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
1133 
1134   if (nneg) { /* Discard heading entries with negative indices in perm_h, as we'll access it from index 0 in MatSetValuesCOO */
1135     MatRowMapKokkosViewHost   newperm_h("perm",n-nneg);
1136     Kokkos::deep_copy(newperm_h,Kokkos::subview(perm_h,Kokkos::make_pair(nneg,n)));
1137     perm_h = newperm_h;
1138   }
1139 
1140   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1141   akok->SetUpCOO(n,has_repeats,jmap_h,perm_h);
1142   PetscFunctionReturn(0);
1143 }
1144 
1145 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1146 {
1147   PetscErrorCode              ierr;
1148   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1149   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1150   PetscInt                    nz = aseq->nz;
1151   const MatRowMapKokkosView&  jmap = akok->jmap_d;
1152   const MatRowMapKokkosView&  perm = akok->perm_d;
1153   MatScalarKokkosView         Aa;
1154   ConstMatScalarKokkosView    kv;
1155   PetscMemType                memtype;
1156 
1157   PetscFunctionBegin;
1158   if (!v) { /* NULL v means an all zero array */
1159     ierr = MatZeroEntries(A);CHKERRQ(ierr);
1160     PetscFunctionReturn(0);
1161   }
1162 
1163   ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
1164   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1165     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,akok->coo_n));
1166   } else {
1167     kv = ConstMatScalarKokkosView(v,akok->coo_n); /* Directly use v[]'s memory */
1168   }
1169 
1170   ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr); /* Might read and write matrix values */
1171   if (imode == INSERT_VALUES) {
1172     Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
1173   }
1174 
1175   if (akok->coo_has_repeats) {
1176     Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) {
1177       for (PetscInt k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));
1178     });
1179   } else {
1180     Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) {Aa(i) += kv(perm(i));});
1181   }
1182   ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);
1183   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1184   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1185   PetscFunctionReturn(0);
1186 }
1187 
1188 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1189 {
1190   PetscErrorCode ierr;
1191 
1192   PetscFunctionBegin;
1193   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
1194   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
1195   B->offloadmask = PETSC_OFFLOAD_CPU;
1196   PetscFunctionReturn(0);
1197 }
1198 
1199 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1200 {
1201   PetscErrorCode     ierr;
1202   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1203 
1204   PetscFunctionBegin;
1205   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1206   A->boundtocpu  = PETSC_FALSE;
1207 
1208   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1209   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1210   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1211   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1212   A->ops->scale                     = MatScale_SeqAIJKokkos;
1213   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1214   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1215   A->ops->mult                      = MatMult_SeqAIJKokkos;
1216   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1217   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1218   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1219   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1220   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1221   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1222   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1223   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1224   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1225   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1226   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1227   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1228   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1229   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1230 
1231   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr);
1232   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr);
1233   PetscFunctionReturn(0);
1234 }
1235 
1236 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1237 {
1238   PetscErrorCode ierr;
1239   Mat_SeqAIJ     *aseq;
1240   PetscInt       i,m,n;
1241 
1242   PetscFunctionBegin;
1243   if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1244 
1245   m    = akok->nrows();
1246   n    = akok->ncols();
1247   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1248   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1249 
1250   /* Set up data structures of A as a MATSEQAIJ */
1251   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1252   aseq = (Mat_SeqAIJ*)(A)->data;
1253 
1254   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1255   akok->j_dual.sync_host();
1256 
1257   aseq->i            = akok->i_host_data();
1258   aseq->j            = akok->j_host_data();
1259   aseq->a            = akok->a_host_data();
1260   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1261   aseq->singlemalloc = PETSC_FALSE;
1262   aseq->free_a       = PETSC_FALSE;
1263   aseq->free_ij      = PETSC_FALSE;
1264   aseq->nz           = akok->nnz();
1265   aseq->maxnz        = aseq->nz;
1266 
1267   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1268   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1269   for (i=0; i<m; i++) {
1270     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1271   }
1272 
1273   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1274   akok->nonzerostate = A->nonzerostate;
1275   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1276   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1277   A->spptr = akok;
1278   PetscFunctionReturn(0);
1279 }
1280 
1281 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1282 
1283    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1284  */
1285 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1286 {
1287   PetscErrorCode ierr;
1288 
1289   PetscFunctionBegin;
1290   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1291   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
1292   PetscFunctionReturn(0);
1293 }
1294 
1295 /* --------------------------------------------------------------------------------*/
1296 /*@C
1297    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1298    (the default parallel PETSc format). This matrix will ultimately be handled by
1299    Kokkos for calculations. For good matrix
1300    assembly performance the user should preallocate the matrix storage by setting
1301    the parameter nz (or the array nnz).  By setting these parameters accurately,
1302    performance during matrix assembly can be increased by more than a factor of 50.
1303 
1304    Collective
1305 
1306    Input Parameters:
1307 +  comm - MPI communicator, set to PETSC_COMM_SELF
1308 .  m - number of rows
1309 .  n - number of columns
1310 .  nz - number of nonzeros per row (same for all rows)
1311 -  nnz - array containing the number of nonzeros in the various rows
1312          (possibly different for each row) or NULL
1313 
1314    Output Parameter:
1315 .  A - the matrix
1316 
1317    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1318    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1319    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1320 
1321    Notes:
1322    If nnz is given then nz is ignored
1323 
1324    The AIJ format (also called the Yale sparse matrix format or
1325    compressed row storage), is fully compatible with standard Fortran 77
1326    storage.  That is, the stored row and column indices can begin at
1327    either one (as in Fortran) or zero.  See the users' manual for details.
1328 
1329    Specify the preallocated storage with either nz or nnz (not both).
1330    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1331    allocation.  For large problems you MUST preallocate memory or you
1332    will get TERRIBLE performance, see the users' manual chapter on matrices.
1333 
1334    By default, this format uses inodes (identical nodes) when possible, to
1335    improve numerical efficiency of matrix-vector products and solves. We
1336    search for consecutive rows with the same nonzero structure, thereby
1337    reusing matrix information to achieve increased efficiency.
1338 
1339    Level: intermediate
1340 
1341 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1342 @*/
1343 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1344 {
1345   PetscErrorCode ierr;
1346 
1347   PetscFunctionBegin;
1348   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1349   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1350   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1351   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1352   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1353   PetscFunctionReturn(0);
1354 }
1355 
1356 typedef Kokkos::TeamPolicy<>::member_type team_member;
1357 //
1358 // This factorization exploits block diagonal matrices with "Nf" (not used).
1359 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1360 //
1361 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1362 {
1363   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1364   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1365   IS                 isrow = b->row,isicol = b->icol;
1366   PetscErrorCode     ierr;
1367   const PetscInt     *r_h,*ic_h;
1368   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data();
1369   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1370   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1371   PetscBool          row_identity,col_identity;
1372   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1373 
1374   PetscFunctionBegin;
1375   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1376   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
1377   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1378   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
1379   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
1380   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
1381   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1382   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1383   {
1384 #define KOKKOS_SHARED_LEVEL 1
1385     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1386     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1387     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1388     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1389     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1390     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1391     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1392     size_t flops_h = 0.0;
1393     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1394     Kokkos::View<size_t> d_flops_k ("flops");
1395     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1396     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1397     Kokkos::deep_copy (d_flops_k, h_flops_k);
1398     Kokkos::deep_copy (d_r_k, h_r_k);
1399     Kokkos::deep_copy (d_ic_k, h_ic_k);
1400     // Fill A --> fact
1401     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1402         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1403         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1404         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1405         // zero rows of B
1406         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1407             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1408             PetscScalar *baL = ba_d + bi_d[rowb];
1409             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1410             /* zero (unfactored row) */
1411             for (int j=0;j<nzbL;j++) baL[j] = 0;
1412             for (int j=0;j<nzbU;j++) baU[j] = 0;
1413           });
1414         // copy A into B
1415         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1416             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1417             const PetscScalar *av    = aa_d + ai_d[rowa];
1418             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1419             /* load in initial (unfactored row) */
1420             for (int j=0;j<nza;j++) {
1421               PetscInt    colb = ic[ajtmp[j]];
1422               PetscScalar vala = av[j];
1423               if (colb == rowb) {
1424                 *(ba_d + bdiag_d[rowb]) = vala;
1425               } else {
1426                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1427                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1428                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1429                 for (int j=0; j<nz ; j++) {
1430                   if (pbj[j] == colb) {
1431                     pba[j] = vala;
1432                     set++;
1433                     break;
1434                   }
1435                 }
1436                #if !defined(PETSC_HAVE_SYCL)
1437                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1438                #endif
1439               }
1440             }
1441           });
1442       });
1443     Kokkos::fence();
1444 
1445     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1446         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1447         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1448         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1449         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1450         const PetscInt  start = field*nloc, end = start + nloc;
1451         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1452         // A22 panel update for each row A(1,:) and col A(:,1)
1453         for (int ii=start; ii<end-1; ii++) {
1454           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1455           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1456           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1457           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1458           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1459               PetscInt kIdx = j*Ni + field_block_idx;
1460               if (kIdx >= nzUi) /* void */ ;
1461               else {
1462                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1463                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1464                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1465                 size_t         st_idx;
1466                 // find and do L(k,i) = A(:k,i) / A(i,i)
1467                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1468                 // get column, there has got to be a better way
1469                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1470                     if (pjL[j] == ii) {
1471                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1472                       idx = j; // output
1473                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1474                     }
1475                 }, st_idx);
1476                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1477 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1478                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1479 #endif
1480                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1481                 // U(i+1,:end)
1482                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1483                       PetscScalar Uij = baUi[uiIdx];
1484                       PetscInt    col = bjUi[uiIdx];
1485                       if (col==myk) {
1486                         // A_kk = A_kk - L_ki * U_ij(k)
1487                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1488                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1489                       } else {
1490                         PetscScalar    *start, *end, *pAkjv=NULL;
1491                         PetscInt       high, low;
1492                         const PetscInt *startj;
1493                         if (col<myk) { // L
1494                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1495                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1496                           start = pLki+1; // start at pLki+1, A22(myk,1)
1497                           startj= bj_d + bi_d[myk] + idx;
1498                           end   = ba_d + bi_d[myk+1];
1499                         } else {
1500                           PetscInt idx = bdiag_d[myk+1]+1;
1501                           start = ba_d + idx;
1502                           startj= bj_d + idx;
1503                           end   = ba_d + bdiag_d[myk];
1504                         }
1505                         // search for 'col', use bisection search - TODO
1506                         low  = 0;
1507                         high = (PetscInt)(end-start);
1508                         while (high-low > 5) {
1509                           int t = (low+high)/2;
1510                           if (startj[t] > col) high = t;
1511                           else                 low  = t;
1512                         }
1513                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1514                           if (startj[pAkjv-start] == col) break;
1515                         }
1516 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1517                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1518 #endif
1519                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1520                       }
1521                     });
1522               }
1523             });
1524           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1525           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1526         } /* endof for (i=0; i<n; i++) { */
1527         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1528       });
1529     Kokkos::fence();
1530     Kokkos::deep_copy (h_flops_k, d_flops_k);
1531     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1532     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1533         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1534         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1535         /* Invert diagonal for simpler triangular solves */
1536         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1537             int i = start + outer_index*Ni + lg_rank%Ni;
1538             if (i < end) {
1539               PetscScalar *pv = ba_d + bdiag_d[i];
1540               *pv = 1.0/(*pv);
1541             }
1542           });
1543       });
1544   }
1545   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1546   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
1547   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
1548 
1549   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1550   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
1551   if (b->inode.size) {
1552     B->ops->solve = MatSolve_SeqAIJ_Inode;
1553   } else if (row_identity && col_identity) {
1554     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1555   } else {
1556     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1557   }
1558   B->offloadmask = PETSC_OFFLOAD_GPU;
1559   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
1560   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1561   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1562   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1563   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1564   B->assembled              = PETSC_TRUE;
1565   B->preallocated           = PETSC_TRUE;
1566 
1567   PetscFunctionReturn(0);
1568 }
1569 
1570 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1571 {
1572   PetscErrorCode   ierr;
1573 
1574   PetscFunctionBegin;
1575   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1576   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1577   PetscFunctionReturn(0);
1578 }
1579 
1580 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1581 {
1582   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1583 
1584   PetscFunctionBegin;
1585   if (!factors->sptrsv_symbolic_completed) {
1586     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1587     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1588     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1589   }
1590   PetscFunctionReturn(0);
1591 }
1592 
1593 /* Check if we need to update factors etc for transpose solve */
1594 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1595 {
1596   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1597   MatColIdxType             n = A->rmap->n;
1598 
1599   PetscFunctionBegin;
1600   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1601     /* Update L^T and do sptrsv symbolic */
1602     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1603     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1604     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1605     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1606 
1607     KokkosKernels::Impl::transpose_matrix<
1608       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1609       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1610       MatRowMapKokkosView,DefaultExecutionSpace>(
1611         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1612         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1613 
1614     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1615       We have to sort the indices, until KK provides finer control options.
1616     */
1617     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1618       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1619         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1620 
1621     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1622 
1623     /* Update U^T and do sptrsv symbolic */
1624     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1625     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1626     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1627     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1628 
1629     KokkosKernels::Impl::transpose_matrix<
1630       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1631       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1632       MatRowMapKokkosView,DefaultExecutionSpace>(
1633         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1634         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1635 
1636     /* Sort indices. See comments above */
1637     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1638       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1639         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1640 
1641     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1642     factors->transpose_updated = PETSC_TRUE;
1643   }
1644   PetscFunctionReturn(0);
1645 }
1646 
1647 /* Solve Ax = b, with A = LU */
1648 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1649 {
1650   PetscErrorCode                 ierr;
1651   ConstPetscScalarKokkosView     bv;
1652   PetscScalarKokkosView          xv;
1653   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1654 
1655   PetscFunctionBegin;
1656   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1657   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
1658   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1659   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1660   /* Solve L tmpv = b */
1661   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1662   /* Solve Ux = tmpv */
1663   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1664   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1665   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1666   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1667   PetscFunctionReturn(0);
1668 }
1669 
1670 /* Solve A^T x = b, where A^T = U^T L^T */
1671 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1672 {
1673   PetscErrorCode                 ierr;
1674   ConstPetscScalarKokkosView     bv;
1675   PetscScalarKokkosView          xv;
1676   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1677 
1678   PetscFunctionBegin;
1679   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1680   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
1681   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1682   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1683   /* Solve U^T tmpv = b */
1684   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1685 
1686   /* Solve L^T x = tmpv */
1687   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1688   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1689   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1690   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1691   PetscFunctionReturn(0);
1692 }
1693 
1694 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1695 {
1696   PetscErrorCode                 ierr;
1697   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1698   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1699   PetscInt                       fill_lev = info->levels;
1700 
1701   PetscFunctionBegin;
1702   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1703   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1704 
1705   auto a_d = aijkok->a_dual.view_device();
1706   auto i_d = aijkok->i_dual.view_device();
1707   auto j_d = aijkok->j_dual.view_device();
1708 
1709   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1710 
1711   B->assembled                       = PETSC_TRUE;
1712   B->preallocated                    = PETSC_TRUE;
1713   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1714   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1715   B->ops->matsolve                   = NULL;
1716   B->ops->matsolvetranspose          = NULL;
1717   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1718 
1719   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1720   factors->transpose_updated         = PETSC_FALSE;
1721   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1722   /* TODO: log flops, but how to know that? */
1723   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1724   PetscFunctionReturn(0);
1725 }
1726 
1727 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1728 {
1729   PetscErrorCode                 ierr;
1730   Mat_SeqAIJKokkos               *aijkok;
1731   Mat_SeqAIJ                     *b;
1732   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1733   PetscInt                       fill_lev = info->levels;
1734   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1735   PetscInt                       n = A->rmap->n;
1736 
1737   PetscFunctionBegin;
1738   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1739   /* Rebuild factors */
1740   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1741   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1742 
1743   /* Create a spiluk handle and then do symbolic factorization */
1744   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1745   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1746 
1747   auto spiluk_handle = factors->kh.get_spiluk_handle();
1748 
1749   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1750   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1751   Kokkos::realloc(factors->iU_d,n+1);
1752   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1753 
1754   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1755   auto i_d = aijkok->i_dual.view_device();
1756   auto j_d = aijkok->j_dual.view_device();
1757   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1758   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1759 
1760   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1761   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1762   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1763   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1764 
1765   /* TODO: add options to select sptrsv algorithms */
1766   /* Create sptrsv handles for L, U and their transpose */
1767  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1768   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1769  #else
1770   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1771  #endif
1772 
1773   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1774   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1775   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1776   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1777 
1778   /* Fill fields of the factor matrix B */
1779   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1780   b     = (Mat_SeqAIJ*)B->data;
1781   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1782   B->info.fill_ratio_given  = info->fill;
1783   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1784 
1785   B->offloadmask            = PETSC_OFFLOAD_GPU;
1786   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1787   PetscFunctionReturn(0);
1788 }
1789 
1790 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1791 {
1792   PetscErrorCode   ierr;
1793   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1794   const PetscInt   nrows   = A->rmap->n;
1795 
1796   PetscFunctionBegin;
1797   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1798   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1799   // move B data into Kokkos
1800   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
1801   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
1802   {
1803     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1804     if (!baijkok->diag_d) {
1805       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1806       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1807       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
1808     }
1809   }
1810   PetscFunctionReturn(0);
1811 }
1812 
1813 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1814 {
1815   PetscFunctionBegin;
1816   *type = MATSOLVERKOKKOS;
1817   PetscFunctionReturn(0);
1818 }
1819 
1820 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1821 {
1822   PetscFunctionBegin;
1823   *type = MATSOLVERKOKKOSDEVICE;
1824   PetscFunctionReturn(0);
1825 }
1826 
1827 /*MC
1828   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1829   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1830 
1831   Level: beginner
1832 
1833 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1834 M*/
1835 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1836 {
1837   PetscErrorCode ierr;
1838   PetscInt       n = A->rmap->n;
1839 
1840   PetscFunctionBegin;
1841   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1842   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1843   (*B)->factortype = ftype;
1844   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1845   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1846 
1847   if (ftype == MAT_FACTOR_LU) {
1848     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1849     (*B)->canuseordering         = PETSC_TRUE;
1850     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1851   } else if (ftype == MAT_FACTOR_ILU) {
1852     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1853     (*B)->canuseordering         = PETSC_FALSE;
1854     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1855   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1856 
1857   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1858   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1859   PetscFunctionReturn(0);
1860 }
1861 
1862 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1863 {
1864   PetscErrorCode ierr;
1865   PetscInt       n = A->rmap->n;
1866 
1867   PetscFunctionBegin;
1868   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1869   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1870   (*B)->factortype = ftype;
1871   (*B)->canuseordering = PETSC_TRUE;
1872   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1873   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1874 
1875   if (ftype == MAT_FACTOR_LU) {
1876     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1877     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1878   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1879 
1880   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1881   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1882   PetscFunctionReturn(0);
1883 }
1884 
1885 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1886 {
1887   PetscErrorCode ierr;
1888 
1889   PetscFunctionBegin;
1890   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1891   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1892   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1893   PetscFunctionReturn(0);
1894 }
1895 
1896 /* Utility to print out a KokkosCsrMatrix for debugging */
1897 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1898 {
1899   PetscErrorCode    ierr;
1900   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1901   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1902   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1903   const PetscInt    *i = iv.data();
1904   const PetscInt    *j = jv.data();
1905   const PetscScalar *a = av.data();
1906   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1907 
1908   PetscFunctionBegin;
1909   ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1910   for (PetscInt k=0; k<m; k++) {
1911     ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr);
1912     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1913       ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1914     }
1915     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1916   }
1917   PetscFunctionReturn(0);
1918 }
1919