xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 7ee59b9b2404baa50ab85d4479d0e2bce942d39e)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosKernels_Sorting.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 
18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   Mat_SeqAIJ       *aijseq;
29   Mat_SeqAIJKokkos *aijkok;
30 
31   PetscFunctionBegin;
32   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
33   PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
34 
35   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
36   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
37 
38   /* If aijkok does not exist, we just copy i, j to device.
39      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
40      In both cases, we build a new aijkok structure.
41   */
42   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
43     delete aijkok;
44     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
45     A->spptr = aijkok;
46   }
47 
48   if (aijkok && aijkok->device_mat_d.data()) {
49     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
50   }
51   PetscFunctionReturn(0);
52 }
53 
54 /* Sync CSR data to device if not yet */
55 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
56 {
57   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
58 
59   PetscFunctionBegin;
60   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
61   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
62   if (aijkok->a_dual.need_sync_device()) {
63     aijkok->a_dual.sync_device();
64     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
65     aijkok->hermitian_updated = PETSC_FALSE;
66   }
67   PetscFunctionReturn(0);
68 }
69 
70 /* Mark the CSR data on device as modified */
71 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
72 {
73   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
74 
75   PetscFunctionBegin;
76   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
77   aijkok->a_dual.clear_sync_state();
78   aijkok->a_dual.modify_device();
79   aijkok->transpose_updated = PETSC_FALSE;
80   aijkok->hermitian_updated = PETSC_FALSE;
81   PetscCall(MatSeqAIJInvalidateDiagonal(A));
82   PetscCall(PetscObjectStateIncrease((PetscObject)A));
83   PetscFunctionReturn(0);
84 }
85 
86 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
87 {
88   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
89 
90   PetscFunctionBegin;
91   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
92   /* We do not expect one needs factors on host  */
93   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
94   PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
95   aijkok->a_dual.sync_host();
96   PetscFunctionReturn(0);
97 }
98 
99 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
100 {
101   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
102 
103   PetscFunctionBegin;
104   if (aijkok) {
105     aijkok->a_dual.sync_host();
106     *array = aijkok->a_dual.view_host().data();
107   } else { /* Happens when calling MatSetValues on a newly created matrix */
108     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
109   }
110   PetscFunctionReturn(0);
111 }
112 
113 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
114 {
115   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
116 
117   PetscFunctionBegin;
118   if (aijkok) aijkok->a_dual.modify_host();
119   PetscFunctionReturn(0);
120 }
121 
122 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
123 {
124   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
125 
126   PetscFunctionBegin;
127   if (aijkok) {
128     aijkok->a_dual.sync_host();
129     *array = aijkok->a_dual.view_host().data();
130   } else {
131     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
132   }
133   PetscFunctionReturn(0);
134 }
135 
136 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
137 {
138   PetscFunctionBegin;
139   *array = NULL;
140   PetscFunctionReturn(0);
141 }
142 
143 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
144 {
145   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
146 
147   PetscFunctionBegin;
148   if (aijkok) {
149     *array = aijkok->a_dual.view_host().data();
150   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
151     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
152   }
153   PetscFunctionReturn(0);
154 }
155 
156 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
157 {
158   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
159 
160   PetscFunctionBegin;
161   if (aijkok) {
162     aijkok->a_dual.clear_sync_state();
163     aijkok->a_dual.modify_host();
164   }
165   PetscFunctionReturn(0);
166 }
167 
168 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
169 {
170   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
171 
172   PetscFunctionBegin;
173   PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL");
174 
175   if (i) *i = aijkok->i_device_data();
176   if (j) *j = aijkok->j_device_data();
177   if (a) {
178     aijkok->a_dual.sync_device();
179     *a = aijkok->a_device_data();
180   }
181   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
182   PetscFunctionReturn(0);
183 }
184 
185 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
186 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
187 {
188   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
189   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
190 
191   PetscFunctionBegin;
192   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
193   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
194   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
195   PetscFunctionReturn(0);
196 }
197 
198 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
199 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
200 {
201   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
202 
203   PetscFunctionBegin;
204   if (aijkok && aijkok->device_mat_d.data()) {
205     *d_mat = aijkok->device_mat_d.data();
206   } else {
207     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
208     *d_mat  = NULL;
209   }
210   PetscFunctionReturn(0);
211 }
212 
213 /* Generate the transpose on device and cache it internally */
214 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
215 {
216   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
217 
218   PetscFunctionBegin;
219   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
220   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
221     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
222     PetscCallCXX(aijkok->a_dual.sync_device());
223     PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
224     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
225     aijkok->transpose_updated = PETSC_TRUE;
226   }
227   *csrmatT = &aijkok->csrmatT;
228   PetscFunctionReturn(0);
229 }
230 
231 /* Generate the Hermitian on device and cache it internally */
232 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
233 {
234   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
235 
236   PetscFunctionBegin;
237   PetscCall(PetscLogGpuTimeBegin());
238   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
239   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
240     PetscCallCXX(aijkok->a_dual.sync_device());
241     PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
242     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
243    #if defined(PETSC_USE_COMPLEX)
244     const auto& a = aijkok->csrmatH.values;
245     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
246    #endif
247     aijkok->hermitian_updated = PETSC_TRUE;
248   }
249   *csrmatH = &aijkok->csrmatH;
250   PetscCall(PetscLogGpuTimeEnd());
251   PetscFunctionReturn(0);
252 }
253 
254 /* y = A x */
255 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
256 {
257   Mat_SeqAIJKokkos                 *aijkok;
258   ConstPetscScalarKokkosView       xv;
259   PetscScalarKokkosView            yv;
260 
261   PetscFunctionBegin;
262   PetscCall(PetscLogGpuTimeBegin());
263   PetscCall(MatSeqAIJKokkosSyncDevice(A));
264   PetscCall(VecGetKokkosView(xx,&xv));
265   PetscCall(VecGetKokkosViewWrite(yy,&yv));
266   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
267   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
268   PetscCall(VecRestoreKokkosView(xx,&xv));
269   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
270   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
271   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
272   PetscCall(PetscLogGpuTimeEnd());
273   PetscFunctionReturn(0);
274 }
275 
276 /* y = A^T x */
277 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
278 {
279   Mat_SeqAIJKokkos                 *aijkok;
280   const char                       *mode;
281   ConstPetscScalarKokkosView       xv;
282   PetscScalarKokkosView            yv;
283   KokkosCsrMatrix                  *csrmat;
284 
285   PetscFunctionBegin;
286   PetscCall(PetscLogGpuTimeBegin());
287   PetscCall(MatSeqAIJKokkosSyncDevice(A));
288   PetscCall(VecGetKokkosView(xx,&xv));
289   PetscCall(VecGetKokkosViewWrite(yy,&yv));
290   if (A->form_explicit_transpose) {
291     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
292     mode = "N";
293   } else {
294     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
295     csrmat = &aijkok->csrmat;
296     mode = "T";
297   }
298   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
299   PetscCall(VecRestoreKokkosView(xx,&xv));
300   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
301   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
302   PetscCall(PetscLogGpuTimeEnd());
303   PetscFunctionReturn(0);
304 }
305 
306 /* y = A^H x */
307 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
308 {
309   Mat_SeqAIJKokkos                 *aijkok;
310   const char                       *mode;
311   ConstPetscScalarKokkosView       xv;
312   PetscScalarKokkosView            yv;
313   KokkosCsrMatrix                  *csrmat;
314 
315   PetscFunctionBegin;
316   PetscCall(PetscLogGpuTimeBegin());
317   PetscCall(MatSeqAIJKokkosSyncDevice(A));
318   PetscCall(VecGetKokkosView(xx,&xv));
319   PetscCall(VecGetKokkosViewWrite(yy,&yv));
320   if (A->form_explicit_transpose) {
321     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
322     mode = "N";
323   } else {
324     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
325     csrmat = &aijkok->csrmat;
326     mode = "C";
327   }
328   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
329   PetscCall(VecRestoreKokkosView(xx,&xv));
330   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
331   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
332   PetscCall(PetscLogGpuTimeEnd());
333   PetscFunctionReturn(0);
334 }
335 
336 /* z = A x + y */
337 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
338 {
339   Mat_SeqAIJKokkos                 *aijkok;
340   ConstPetscScalarKokkosView       xv,yv;
341   PetscScalarKokkosView            zv;
342 
343   PetscFunctionBegin;
344   PetscCall(PetscLogGpuTimeBegin());
345   PetscCall(MatSeqAIJKokkosSyncDevice(A));
346   PetscCall(VecGetKokkosView(xx,&xv));
347   PetscCall(VecGetKokkosView(yy,&yv));
348   PetscCall(VecGetKokkosViewWrite(zz,&zv));
349   if (zz != yy) Kokkos::deep_copy(zv,yv);
350   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
351   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
352   PetscCall(VecRestoreKokkosView(xx,&xv));
353   PetscCall(VecRestoreKokkosView(yy,&yv));
354   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
355   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
356   PetscCall(PetscLogGpuTimeEnd());
357   PetscFunctionReturn(0);
358 }
359 
360 /* z = A^T x + y */
361 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
362 {
363   Mat_SeqAIJKokkos                 *aijkok;
364   const char                       *mode;
365   ConstPetscScalarKokkosView       xv,yv;
366   PetscScalarKokkosView            zv;
367   KokkosCsrMatrix                  *csrmat;
368 
369   PetscFunctionBegin;
370   PetscCall(PetscLogGpuTimeBegin());
371   PetscCall(MatSeqAIJKokkosSyncDevice(A));
372   PetscCall(VecGetKokkosView(xx,&xv));
373   PetscCall(VecGetKokkosView(yy,&yv));
374   PetscCall(VecGetKokkosViewWrite(zz,&zv));
375   if (zz != yy) Kokkos::deep_copy(zv,yv);
376   if (A->form_explicit_transpose) {
377     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
378     mode = "N";
379   } else {
380     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
381     csrmat = &aijkok->csrmat;
382     mode = "T";
383   }
384   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
385   PetscCall(VecRestoreKokkosView(xx,&xv));
386   PetscCall(VecRestoreKokkosView(yy,&yv));
387   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
388   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
389   PetscCall(PetscLogGpuTimeEnd());
390   PetscFunctionReturn(0);
391 }
392 
393 /* z = A^H x + y */
394 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
395 {
396   Mat_SeqAIJKokkos                 *aijkok;
397   const char                       *mode;
398   ConstPetscScalarKokkosView       xv,yv;
399   PetscScalarKokkosView            zv;
400   KokkosCsrMatrix                  *csrmat;
401 
402   PetscFunctionBegin;
403   PetscCall(PetscLogGpuTimeBegin());
404   PetscCall(MatSeqAIJKokkosSyncDevice(A));
405   PetscCall(VecGetKokkosView(xx,&xv));
406   PetscCall(VecGetKokkosView(yy,&yv));
407   PetscCall(VecGetKokkosViewWrite(zz,&zv));
408   if (zz != yy) Kokkos::deep_copy(zv,yv);
409   if (A->form_explicit_transpose) {
410     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
411     mode = "N";
412   } else {
413     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
414     csrmat = &aijkok->csrmat;
415     mode = "C";
416   }
417   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
418   PetscCall(VecRestoreKokkosView(xx,&xv));
419   PetscCall(VecRestoreKokkosView(yy,&yv));
420   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
421   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
422   PetscCall(PetscLogGpuTimeEnd());
423   PetscFunctionReturn(0);
424 }
425 
426 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
427 {
428   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
429 
430   PetscFunctionBegin;
431   switch (op) {
432     case MAT_FORM_EXPLICIT_TRANSPOSE:
433       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
434       if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
435       A->form_explicit_transpose = flg;
436       break;
437     default:
438       PetscCall(MatSetOption_SeqAIJ(A,op,flg));
439       break;
440   }
441   PetscFunctionReturn(0);
442 }
443 
444 /* Depending on reuse, either build a new mat, or use the existing mat */
445 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
446 {
447   Mat_SeqAIJ       *aseq;
448 
449   PetscFunctionBegin;
450   PetscCall(PetscKokkosInitializeCheck());
451   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
452     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */
453   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
454     PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
455   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
456     PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
457     PetscCall(PetscFree(A->defaultvectype));
458     PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */
459     PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS));
460     PetscCall(MatSetOps_SeqAIJKokkos(A));
461     aseq = static_cast<Mat_SeqAIJ*>(A->data);
462     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
463       PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
464       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
465     }
466   }
467   PetscFunctionReturn(0);
468 }
469 
470 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
471    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
472  */
473 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
474 {
475   Mat_SeqAIJ            *bseq;
476   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
477   Mat                   mat;
478 
479   PetscFunctionBegin;
480   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
481   PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B));
482   mat  = *B;
483   if (A->assembled) {
484     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
485     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
486     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
487     /* Now copy values to B if needed */
488     if (dupOption == MAT_COPY_VALUES) {
489       if (akok->a_dual.need_sync_device()) {
490         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
491         bkok->a_dual.modify_host();
492       } else { /* If device has the latest data, we only copy data on device */
493         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
494         bkok->a_dual.modify_device();
495       }
496     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
497       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
498       bkok->a_dual.modify_host();
499     }
500     mat->spptr = bkok;
501   }
502 
503   PetscCall(PetscFree(mat->defaultvectype));
504   PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */
505   PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS));
506   PetscCall(MatSetOps_SeqAIJKokkos(mat));
507   PetscFunctionReturn(0);
508 }
509 
510 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
511 {
512   Mat               At;
513   KokkosCsrMatrix   *internT;
514   Mat_SeqAIJKokkos  *atkok,*bkok;
515 
516   PetscFunctionBegin;
517   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */
518   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
519     /* Deep copy internT, as we want to isolate the internal transpose */
520     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
521     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At));
522     if (reuse == MAT_INITIAL_MATRIX) *B = At;
523     else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */
524   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
525     if ((*B)->assembled) {
526       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
527       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
528       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
529     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
530       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
531       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
532       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
533       PetscCallCXX(Kokkos::deep_copy(a_h,internT->values));
534       PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
535     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
536   }
537   PetscFunctionReturn(0);
538 }
539 
540 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
541 {
542   Mat_SeqAIJKokkos           *aijkok;
543 
544   PetscFunctionBegin;
545   if (A->factortype == MAT_FACTOR_NONE) {
546     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
547     delete aijkok;
548   } else {
549     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
550   }
551   A->spptr = NULL;
552   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
553   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
554   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
555   PetscCall(MatDestroy_SeqAIJ(A));
556   PetscFunctionReturn(0);
557 }
558 
559 /*MC
560    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
561 
562    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
563 
564    Options Database Keys:
565 .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
566 
567   Level: beginner
568 
569 .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS
570 M*/
571 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
572 {
573   PetscFunctionBegin;
574   PetscCall(PetscKokkosInitializeCheck());
575   PetscCall(MatCreate_SeqAIJ(A));
576   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A));
577   PetscFunctionReturn(0);
578 }
579 
580 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
581 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
582 {
583   Mat_SeqAIJ                   *a,*b;
584   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
585   MatScalarKokkosView          aa,ba,ca;
586   MatRowMapKokkosView          ai,bi,ci;
587   MatColIdxKokkosView          aj,bj,cj;
588   PetscInt                     m,n,nnz,aN;
589 
590   PetscFunctionBegin;
591   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
592   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
593   PetscValidPointer(C,4);
594   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
595   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
596   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
597   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
598 
599   PetscCall(MatSeqAIJKokkosSyncDevice(A));
600   PetscCall(MatSeqAIJKokkosSyncDevice(B));
601   a    = static_cast<Mat_SeqAIJ*>(A->data);
602   b    = static_cast<Mat_SeqAIJ*>(B->data);
603   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
604   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
605   aa   = akok->a_dual.view_device();
606   ai   = akok->i_dual.view_device();
607   ba   = bkok->a_dual.view_device();
608   bi   = bkok->i_dual.view_device();
609   m    = A->rmap->n; /* M, N and nnz of C */
610   n    = A->cmap->n + B->cmap->n;
611   nnz  = a->nz + b->nz;
612   aN   = A->cmap->n; /* N of A */
613   if (reuse == MAT_INITIAL_MATRIX) {
614     aj = akok->j_dual.view_device();
615     bj = bkok->j_dual.view_device();
616     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
617     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
618     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
619     ca = ca_dual.view_device();
620     ci = ci_dual.view_device();
621     cj = cj_dual.view_device();
622 
623     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
624     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
625       PetscInt i = t.league_rank(); /* row i */
626       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
627 
628       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
629         ci(i) = coffset;
630         if (i == m-1) ci(m) = ai(m) + bi(m);
631       });
632 
633       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
634         if (k < alen) {
635           ca(coffset+k) = aa(ai(i)+k);
636           cj(coffset+k) = aj(ai(i)+k);
637         } else {
638           ca(coffset+k) = ba(bi(i)+k-alen);
639           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
640         }
641       });
642     });
643     ca_dual.modify_device();
644     ci_dual.modify_device();
645     cj_dual.modify_device();
646     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
647     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C));
648   } else if (reuse == MAT_REUSE_MATRIX) {
649     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
650     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
651     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
652     ca   = ckok->a_dual.view_device();
653     ci   = ckok->i_dual.view_device();
654 
655     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
656       PetscInt i = t.league_rank(); /* row i */
657       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
658       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
659         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
660         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
661       });
662     });
663     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
664   }
665   PetscFunctionReturn(0);
666 }
667 
668 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
669 {
670   PetscFunctionBegin;
671   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
672   PetscFunctionReturn(0);
673 }
674 
675 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
676 {
677   Mat_Product                    *product = C->product;
678   Mat                            A,B;
679   bool                           transA,transB; /* use bool, since KK needs this type */
680   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
681   Mat_SeqAIJ                     *c;
682   MatProductData_SeqAIJKokkos    *pdata;
683   KokkosCsrMatrix                *csrmatA,*csrmatB;
684 
685   PetscFunctionBegin;
686   MatCheckProduct(C,1);
687   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
688   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
689 
690   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
691     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
692     PetscFunctionReturn(0);
693   }
694 
695   switch (product->type) {
696     case MATPRODUCT_AB:  transA = false; transB = false; break;
697     case MATPRODUCT_AtB: transA = true;  transB = false; break;
698     case MATPRODUCT_ABt: transA = false; transB = true;  break;
699     default:
700       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
701   }
702 
703   A     = product->A;
704   B     = product->B;
705   PetscCall(MatSeqAIJKokkosSyncDevice(A));
706   PetscCall(MatSeqAIJKokkosSyncDevice(B));
707   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
708   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
709   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
710 
711   PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
712 
713   csrmatA = &akok->csrmat;
714   csrmatB = &bkok->csrmat;
715 
716   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
717   if (transA) {
718     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
719     transA = false;
720   }
721 
722   if (transB) {
723     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
724     transB = false;
725   }
726   PetscCall(PetscLogGpuTimeBegin());
727   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
728   PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
729   PetscCall(PetscLogGpuTimeEnd());
730   PetscCall(MatSeqAIJKokkosModifyDevice(C));
731   /* shorter version of MatAssemblyEnd_SeqAIJ */
732   c = (Mat_SeqAIJ*)C->data;
733   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
734   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
735   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
736   c->reallocs         = 0;
737   C->info.mallocs     = 0;
738   C->info.nz_unneeded = 0;
739   C->assembled        = C->was_assembled = PETSC_TRUE;
740   C->num_ass++;
741   PetscFunctionReturn(0);
742 }
743 
744 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
745 {
746   Mat_Product                    *product = C->product;
747   MatProductType                 ptype;
748   Mat                            A,B;
749   bool                           transA,transB;
750   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
751   MatProductData_SeqAIJKokkos    *pdata;
752   MPI_Comm                       comm;
753   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
754 
755   PetscFunctionBegin;
756   MatCheckProduct(C,1);
757   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
758   PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
759   A       = product->A;
760   B       = product->B;
761   PetscCall(MatSeqAIJKokkosSyncDevice(A));
762   PetscCall(MatSeqAIJKokkosSyncDevice(B));
763   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
764   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
765   csrmatA = &akok->csrmat;
766   csrmatB = &bkok->csrmat;
767 
768   ptype   = product->type;
769   switch (ptype) {
770     case MATPRODUCT_AB:  transA = false; transB = false; break;
771     case MATPRODUCT_AtB: transA = true;  transB = false; break;
772     case MATPRODUCT_ABt: transA = false; transB = true;  break;
773     default:
774       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
775   }
776 
777   product->data = pdata = new MatProductData_SeqAIJKokkos();
778   pdata->kh.set_team_work_size(16);
779   pdata->kh.set_dynamic_scheduling(true);
780   pdata->reusesym = product->api_user;
781 
782   /* TODO: add command line options to select spgemm algorithms */
783   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
784   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
785      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
786    */
787   pdata->kh.create_spgemm_handle(spgemm_alg);
788 
789   PetscCall(PetscLogGpuTimeBegin());
790   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
791   if (transA) {
792     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
793     transA = false;
794   }
795 
796   if (transB) {
797     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
798     transB = false;
799   }
800 
801   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
802   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
803     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
804     calling new Mat_SeqAIJKokkos().
805     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
806   */
807   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
808   PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC));
809   PetscCall(PetscLogGpuTimeEnd());
810 
811   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
812   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok));
813   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
814   PetscFunctionReturn(0);
815 }
816 
817 /* handles sparse matrix matrix ops */
818 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
819 {
820   Mat_Product    *product = mat->product;
821   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
822 
823   PetscFunctionBegin;
824   MatCheckProduct(mat,1);
825   PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok));
826   if (product->type == MATPRODUCT_ABC) {
827     PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok));
828   }
829   if (Biskok && Ciskok) {
830     switch (product->type) {
831     case MATPRODUCT_AB:
832     case MATPRODUCT_AtB:
833     case MATPRODUCT_ABt:
834       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
835       break;
836     case MATPRODUCT_PtAP:
837     case MATPRODUCT_RARt:
838     case MATPRODUCT_ABC:
839       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
840       break;
841     default:
842       break;
843     }
844   } else { /* fallback for AIJ */
845     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
846   }
847   PetscFunctionReturn(0);
848 }
849 
850 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
851 {
852   Mat_SeqAIJKokkos *aijkok;
853 
854   PetscFunctionBegin;
855   PetscCall(PetscLogGpuTimeBegin());
856   PetscCall(MatSeqAIJKokkosSyncDevice(A));
857   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
858   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
859   PetscCall(MatSeqAIJKokkosModifyDevice(A));
860   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
861   PetscCall(PetscLogGpuTimeEnd());
862   PetscFunctionReturn(0);
863 }
864 
865 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
866 {
867   Mat_SeqAIJKokkos *aijkok;
868 
869   PetscFunctionBegin;
870   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
871   if (aijkok) { /* Only zero the device if data is already there */
872     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
873     PetscCall(MatSeqAIJKokkosModifyDevice(A));
874   } else { /* Might be preallocated but not assembled */
875     PetscCall(MatZeroEntries_SeqAIJ(A));
876   }
877   PetscFunctionReturn(0);
878 }
879 
880 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
881 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
882 {
883   Mat_SeqAIJKokkos   *aijkok;
884 
885   PetscFunctionBegin;
886   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
887   PetscValidPointer(kv,2);
888   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
889   PetscCall(MatSeqAIJKokkosSyncDevice(A));
890   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
891   *kv    = aijkok->a_dual.view_device();
892   PetscFunctionReturn(0);
893 }
894 
895 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
896 {
897   PetscFunctionBegin;
898   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
899   PetscValidPointer(kv,2);
900   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
901   PetscFunctionReturn(0);
902 }
903 
904 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
905 {
906   Mat_SeqAIJKokkos   *aijkok;
907 
908   PetscFunctionBegin;
909   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
910   PetscValidPointer(kv,2);
911   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
912   PetscCall(MatSeqAIJKokkosSyncDevice(A));
913   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
914   *kv    = aijkok->a_dual.view_device();
915   PetscFunctionReturn(0);
916 }
917 
918 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
919 {
920   PetscFunctionBegin;
921   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
922   PetscValidPointer(kv,2);
923   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
924   PetscCall(MatSeqAIJKokkosModifyDevice(A));
925   PetscFunctionReturn(0);
926 }
927 
928 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
929 {
930   Mat_SeqAIJKokkos   *aijkok;
931 
932   PetscFunctionBegin;
933   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
934   PetscValidPointer(kv,2);
935   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
936   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
937   *kv    = aijkok->a_dual.view_device();
938   PetscFunctionReturn(0);
939 }
940 
941 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
942 {
943   PetscFunctionBegin;
944   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
945   PetscValidPointer(kv,2);
946   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
947   PetscCall(MatSeqAIJKokkosModifyDevice(A));
948   PetscFunctionReturn(0);
949 }
950 
951 /* Computes Y += alpha X */
952 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
953 {
954   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
955   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
956   ConstMatScalarKokkosView   Xa;
957   MatScalarKokkosView        Ya;
958 
959   PetscFunctionBegin;
960   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
961   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
962   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
963   PetscCall(MatSeqAIJKokkosSyncDevice(X));
964   PetscCall(PetscLogGpuTimeBegin());
965 
966   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
967     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
968     PetscBool e;
969     PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e));
970     if (e) {
971       PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e));
972       if (e) pattern = SAME_NONZERO_PATTERN;
973     }
974   }
975 
976   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
977     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
978     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
979     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
980   */
981   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
982   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
983   Xa   = xkok->a_dual.view_device();
984   Ya   = ykok->a_dual.view_device();
985 
986   if (pattern == SAME_NONZERO_PATTERN) {
987     KokkosBlas::axpy(alpha,Xa,Ya);
988     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
989   } else if (pattern == SUBSET_NONZERO_PATTERN) {
990     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
991     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
992 
993     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
994       PetscInt i = t.league_rank(); /* row i */
995       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
996         PetscInt p,q = Yi(i);
997         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
998           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
999           if (Xj(p) == Yj(q)) { /* Found it */
1000             Ya(q) += alpha * Xa(p);
1001             q++;
1002           } else {
1003             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1004                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1005             */
1006             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1007           }
1008         }
1009       });
1010     });
1011     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1012   } else { /* different nonzero patterns */
1013     Mat             Z;
1014     KokkosCsrMatrix zcsr;
1015     KernelHandle    kh;
1016     kh.create_spadd_handle(false);
1017     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1018     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1019     zkok = new Mat_SeqAIJKokkos(zcsr);
1020     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z));
1021     PetscCall(MatHeaderReplace(Y,&Z));
1022     kh.destroy_spadd_handle();
1023   }
1024   PetscCall(PetscLogGpuTimeEnd());
1025   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */
1026   PetscFunctionReturn(0);
1027 }
1028 
1029 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
1030 {
1031   Mat_SeqAIJKokkos *akok;
1032   Mat_SeqAIJ       *aseq;
1033 
1034   PetscFunctionBegin;
1035   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
1036   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
1037   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1038   delete akok;
1039   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE);
1040   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1041   akok->SetUpCOO(aseq);
1042   PetscFunctionReturn(0);
1043 }
1044 
1045 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1046 {
1047   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1048   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1049   PetscCount                  Annz = aseq->nz;
1050   const PetscCountKokkosView& jmap = akok->jmap_d;
1051   const PetscCountKokkosView& perm = akok->perm_d;
1052   MatScalarKokkosView         Aa;
1053   ConstMatScalarKokkosView    kv;
1054   PetscMemType                memtype;
1055 
1056   PetscFunctionBegin;
1057   PetscCall(PetscGetMemType(v,&memtype));
1058   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1059     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
1060   } else {
1061     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
1062   }
1063 
1064   if (imode == INSERT_VALUES) {
1065     PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */
1066     Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
1067   } else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */
1068 
1069   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));});
1070 
1071   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
1072   else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa));
1073   PetscFunctionReturn(0);
1074 }
1075 
1076 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1077 {
1078   PetscFunctionBegin;
1079   PetscCall(MatSeqAIJKokkosSyncHost(A));
1080   PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
1081   B->offloadmask = PETSC_OFFLOAD_CPU;
1082   PetscFunctionReturn(0);
1083 }
1084 
1085 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1086 {
1087   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1088 
1089   PetscFunctionBegin;
1090   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1091   A->boundtocpu  = PETSC_FALSE;
1092 
1093   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1094   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1095   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1096   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1097   A->ops->scale                     = MatScale_SeqAIJKokkos;
1098   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1099   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1100   A->ops->mult                      = MatMult_SeqAIJKokkos;
1101   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1102   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1103   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1104   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1105   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1106   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1107   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1108   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1109   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1110   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1111   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1112   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1113   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1114   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1115   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1116 
1117   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos));
1118   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos));
1119   PetscFunctionReturn(0);
1120 }
1121 
1122 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1123 {
1124   Mat_SeqAIJ *aseq;
1125   PetscInt    i,m,n;
1126 
1127   PetscFunctionBegin;
1128   PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1129 
1130   m = akok->nrows();
1131   n = akok->ncols();
1132   PetscCall(MatSetSizes(A,m,n,m,n));
1133   PetscCall(MatSetType(A,MATSEQAIJKOKKOS));
1134 
1135   /* Set up data structures of A as a MATSEQAIJ */
1136   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL));
1137   aseq = (Mat_SeqAIJ*)(A)->data;
1138 
1139   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1140   akok->j_dual.sync_host();
1141 
1142   aseq->i            = akok->i_host_data();
1143   aseq->j            = akok->j_host_data();
1144   aseq->a            = akok->a_host_data();
1145   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1146   aseq->singlemalloc = PETSC_FALSE;
1147   aseq->free_a       = PETSC_FALSE;
1148   aseq->free_ij      = PETSC_FALSE;
1149   aseq->nz           = akok->nnz();
1150   aseq->maxnz        = aseq->nz;
1151 
1152   PetscCall(PetscMalloc1(m,&aseq->imax));
1153   PetscCall(PetscMalloc1(m,&aseq->ilen));
1154   for (i=0; i<m; i++) {
1155     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1156   }
1157 
1158   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1159   akok->nonzerostate = A->nonzerostate;
1160   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1161   PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
1162   PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
1163   PetscFunctionReturn(0);
1164 }
1165 
1166 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1167 
1168    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1169  */
1170 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1171 {
1172   PetscFunctionBegin;
1173   PetscCall(MatCreate(comm,A));
1174   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok));
1175   PetscFunctionReturn(0);
1176 }
1177 
1178 /* --------------------------------------------------------------------------------*/
1179 /*@C
1180    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1181    (the default parallel PETSc format). This matrix will ultimately be handled by
1182    Kokkos for calculations. For good matrix
1183    assembly performance the user should preallocate the matrix storage by setting
1184    the parameter nz (or the array nnz).  By setting these parameters accurately,
1185    performance during matrix assembly can be increased by more than a factor of 50.
1186 
1187    Collective
1188 
1189    Input Parameters:
1190 +  comm - MPI communicator, set to PETSC_COMM_SELF
1191 .  m - number of rows
1192 .  n - number of columns
1193 .  nz - number of nonzeros per row (same for all rows)
1194 -  nnz - array containing the number of nonzeros in the various rows
1195          (possibly different for each row) or NULL
1196 
1197    Output Parameter:
1198 .  A - the matrix
1199 
1200    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1201    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1202    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1203 
1204    Notes:
1205    If nnz is given then nz is ignored
1206 
1207    The AIJ format (also called the Yale sparse matrix format or
1208    compressed row storage), is fully compatible with standard Fortran 77
1209    storage.  That is, the stored row and column indices can begin at
1210    either one (as in Fortran) or zero.  See the users' manual for details.
1211 
1212    Specify the preallocated storage with either nz or nnz (not both).
1213    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1214    allocation.  For large problems you MUST preallocate memory or you
1215    will get TERRIBLE performance, see the users' manual chapter on matrices.
1216 
1217    By default, this format uses inodes (identical nodes) when possible, to
1218    improve numerical efficiency of matrix-vector products and solves. We
1219    search for consecutive rows with the same nonzero structure, thereby
1220    reusing matrix information to achieve increased efficiency.
1221 
1222    Level: intermediate
1223 
1224 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1225 @*/
1226 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1227 {
1228   PetscFunctionBegin;
1229   PetscCall(PetscKokkosInitializeCheck());
1230   PetscCall(MatCreate(comm,A));
1231   PetscCall(MatSetSizes(*A,m,n,m,n));
1232   PetscCall(MatSetType(*A,MATSEQAIJKOKKOS));
1233   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
1234   PetscFunctionReturn(0);
1235 }
1236 
1237 typedef Kokkos::TeamPolicy<>::member_type team_member;
1238 //
1239 // This factorization exploits block diagonal matrices with "Nf" (not used).
1240 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1241 //
1242 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1243 {
1244   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1245   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1246   IS                 isrow = b->row,isicol = b->icol;
1247   const PetscInt     *r_h,*ic_h;
1248   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1249   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1250   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1251   PetscBool          row_identity,col_identity;
1252   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1253 
1254   PetscFunctionBegin;
1255   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1256   PetscCall(MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity));
1257   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1258   PetscCall(ISGetIndices(isrow,&r_h));
1259   PetscCall(ISGetIndices(isicol,&ic_h));
1260   PetscCall(ISGetSize(isicol,&nc));
1261   PetscCall(PetscLogGpuTimeBegin());
1262   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1263   {
1264 #define KOKKOS_SHARED_LEVEL 1
1265     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1266     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1267     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1268     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1269     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1270     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1271     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1272     size_t flops_h = 0.0;
1273     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1274     Kokkos::View<size_t> d_flops_k ("flops");
1275     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1276     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1277     Kokkos::deep_copy (d_flops_k, h_flops_k);
1278     Kokkos::deep_copy (d_r_k, h_r_k);
1279     Kokkos::deep_copy (d_ic_k, h_ic_k);
1280     // Fill A --> fact
1281     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1282         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1283         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1284         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1285         // zero rows of B
1286         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1287             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1288             PetscScalar *baL = ba_d + bi_d[rowb];
1289             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1290             /* zero (unfactored row) */
1291             for (int j=0;j<nzbL;j++) baL[j] = 0;
1292             for (int j=0;j<nzbU;j++) baU[j] = 0;
1293           });
1294         // copy A into B
1295         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1296             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1297             const PetscScalar *av    = aa_d + ai_d[rowa];
1298             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1299             /* load in initial (unfactored row) */
1300             for (int j=0;j<nza;j++) {
1301               PetscInt    colb = ic[ajtmp[j]];
1302               PetscScalar vala = av[j];
1303               if (colb == rowb) {
1304                 *(ba_d + bdiag_d[rowb]) = vala;
1305               } else {
1306                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1307                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1308                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1309                 for (int j=0; j<nz ; j++) {
1310                   if (pbj[j] == colb) {
1311                     pba[j] = vala;
1312                     set++;
1313                     break;
1314                   }
1315                 }
1316                #if !defined(PETSC_HAVE_SYCL)
1317                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1318                #endif
1319               }
1320             }
1321           });
1322       });
1323     Kokkos::fence();
1324 
1325     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1326         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1327         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1328         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1329         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1330         const PetscInt  start = field*nloc, end = start + nloc;
1331         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1332         // A22 panel update for each row A(1,:) and col A(:,1)
1333         for (int ii=start; ii<end-1; ii++) {
1334           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1335           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1336           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1337           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1338           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1339               PetscInt kIdx = j*Ni + field_block_idx;
1340               if (kIdx >= nzUi) /* void */ ;
1341               else {
1342                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1343                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1344                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1345                 size_t         st_idx;
1346                 // find and do L(k,i) = A(:k,i) / A(i,i)
1347                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1348                 // get column, there has got to be a better way
1349                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1350                     if (pjL[j] == ii) {
1351                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1352                       idx = j; // output
1353                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1354                     }
1355                 }, st_idx);
1356                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1357 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1358                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1359 #endif
1360                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1361                 // U(i+1,:end)
1362                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1363                       PetscScalar Uij = baUi[uiIdx];
1364                       PetscInt    col = bjUi[uiIdx];
1365                       if (col==myk) {
1366                         // A_kk = A_kk - L_ki * U_ij(k)
1367                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1368                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1369                       } else {
1370                         PetscScalar    *start, *end, *pAkjv=NULL;
1371                         PetscInt       high, low;
1372                         const PetscInt *startj;
1373                         if (col<myk) { // L
1374                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1375                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1376                           start = pLki+1; // start at pLki+1, A22(myk,1)
1377                           startj= bj_d + bi_d[myk] + idx;
1378                           end   = ba_d + bi_d[myk+1];
1379                         } else {
1380                           PetscInt idx = bdiag_d[myk+1]+1;
1381                           start = ba_d + idx;
1382                           startj= bj_d + idx;
1383                           end   = ba_d + bdiag_d[myk];
1384                         }
1385                         // search for 'col', use bisection search - TODO
1386                         low  = 0;
1387                         high = (PetscInt)(end-start);
1388                         while (high-low > 5) {
1389                           int t = (low+high)/2;
1390                           if (startj[t] > col) high = t;
1391                           else                 low  = t;
1392                         }
1393                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1394                           if (startj[pAkjv-start] == col) break;
1395                         }
1396 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1397                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1398 #endif
1399                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1400                       }
1401                     });
1402               }
1403             });
1404           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1405           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1406         } /* endof for (i=0; i<n; i++) { */
1407         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1408       });
1409     Kokkos::fence();
1410     Kokkos::deep_copy (h_flops_k, d_flops_k);
1411     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
1412     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1413         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1414         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1415         /* Invert diagonal for simpler triangular solves */
1416         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1417             int i = start + outer_index*Ni + lg_rank%Ni;
1418             if (i < end) {
1419               PetscScalar *pv = ba_d + bdiag_d[i];
1420               *pv = 1.0/(*pv);
1421             }
1422           });
1423       });
1424   }
1425   PetscCall(PetscLogGpuTimeEnd());
1426   PetscCall(ISRestoreIndices(isicol,&ic_h));
1427   PetscCall(ISRestoreIndices(isrow,&r_h));
1428 
1429   PetscCall(ISIdentity(isrow,&row_identity));
1430   PetscCall(ISIdentity(isicol,&col_identity));
1431   if (b->inode.size) {
1432     B->ops->solve = MatSolve_SeqAIJ_Inode;
1433   } else if (row_identity && col_identity) {
1434     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1435   } else {
1436     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1437   }
1438   B->offloadmask = PETSC_OFFLOAD_GPU;
1439   PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU
1440   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1441   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1442   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1443   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1444   B->assembled              = PETSC_TRUE;
1445   B->preallocated           = PETSC_TRUE;
1446 
1447   PetscFunctionReturn(0);
1448 }
1449 
1450 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1451 {
1452   PetscFunctionBegin;
1453   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
1454   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1455   PetscFunctionReturn(0);
1456 }
1457 
1458 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1459 {
1460   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1461 
1462   PetscFunctionBegin;
1463   if (!factors->sptrsv_symbolic_completed) {
1464     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1465     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1466     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1467   }
1468   PetscFunctionReturn(0);
1469 }
1470 
1471 /* Check if we need to update factors etc for transpose solve */
1472 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1473 {
1474   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1475   MatColIdxType             n = A->rmap->n;
1476 
1477   PetscFunctionBegin;
1478   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1479     /* Update L^T and do sptrsv symbolic */
1480     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1481     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1482     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1483     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1484 
1485     KokkosKernels::Impl::transpose_matrix<
1486       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1487       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1488       MatRowMapKokkosView,DefaultExecutionSpace>(
1489         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1490         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1491 
1492     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1493       We have to sort the indices, until KK provides finer control options.
1494     */
1495     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1496       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1497         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1498 
1499     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1500 
1501     /* Update U^T and do sptrsv symbolic */
1502     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1503     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1504     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1505     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1506 
1507     KokkosKernels::Impl::transpose_matrix<
1508       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1509       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1510       MatRowMapKokkosView,DefaultExecutionSpace>(
1511         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1512         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1513 
1514     /* Sort indices. See comments above */
1515     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1516       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1517         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1518 
1519     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1520     factors->transpose_updated = PETSC_TRUE;
1521   }
1522   PetscFunctionReturn(0);
1523 }
1524 
1525 /* Solve Ax = b, with A = LU */
1526 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1527 {
1528   ConstPetscScalarKokkosView     bv;
1529   PetscScalarKokkosView          xv;
1530   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1531 
1532   PetscFunctionBegin;
1533   PetscCall(PetscLogGpuTimeBegin());
1534   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1535   PetscCall(VecGetKokkosView(b,&bv));
1536   PetscCall(VecGetKokkosViewWrite(x,&xv));
1537   /* Solve L tmpv = b */
1538   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1539   /* Solve Ux = tmpv */
1540   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1541   PetscCall(VecRestoreKokkosView(b,&bv));
1542   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
1543   PetscCall(PetscLogGpuTimeEnd());
1544   PetscFunctionReturn(0);
1545 }
1546 
1547 /* Solve A^T x = b, where A^T = U^T L^T */
1548 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1549 {
1550   ConstPetscScalarKokkosView     bv;
1551   PetscScalarKokkosView          xv;
1552   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1553 
1554   PetscFunctionBegin;
1555   PetscCall(PetscLogGpuTimeBegin());
1556   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1557   PetscCall(VecGetKokkosView(b,&bv));
1558   PetscCall(VecGetKokkosViewWrite(x,&xv));
1559   /* Solve U^T tmpv = b */
1560   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1561 
1562   /* Solve L^T x = tmpv */
1563   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1564   PetscCall(VecRestoreKokkosView(b,&bv));
1565   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
1566   PetscCall(PetscLogGpuTimeEnd());
1567   PetscFunctionReturn(0);
1568 }
1569 
1570 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1571 {
1572   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1573   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1574   PetscInt                       fill_lev = info->levels;
1575 
1576   PetscFunctionBegin;
1577   PetscCall(PetscLogGpuTimeBegin());
1578   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1579 
1580   auto a_d = aijkok->a_dual.view_device();
1581   auto i_d = aijkok->i_dual.view_device();
1582   auto j_d = aijkok->j_dual.view_device();
1583 
1584   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1585 
1586   B->assembled                       = PETSC_TRUE;
1587   B->preallocated                    = PETSC_TRUE;
1588   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1589   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1590   B->ops->matsolve                   = NULL;
1591   B->ops->matsolvetranspose          = NULL;
1592   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1593 
1594   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1595   factors->transpose_updated         = PETSC_FALSE;
1596   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1597   /* TODO: log flops, but how to know that? */
1598   PetscCall(PetscLogGpuTimeEnd());
1599   PetscFunctionReturn(0);
1600 }
1601 
1602 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1603 {
1604   Mat_SeqAIJKokkos               *aijkok;
1605   Mat_SeqAIJ                     *b;
1606   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1607   PetscInt                       fill_lev = info->levels;
1608   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1609   PetscInt                       n = A->rmap->n;
1610 
1611   PetscFunctionBegin;
1612   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1613   /* Rebuild factors */
1614   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1615   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1616 
1617   /* Create a spiluk handle and then do symbolic factorization */
1618   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1619   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1620 
1621   auto spiluk_handle = factors->kh.get_spiluk_handle();
1622 
1623   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1624   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1625   Kokkos::realloc(factors->iU_d,n+1);
1626   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1627 
1628   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1629   auto i_d = aijkok->i_dual.view_device();
1630   auto j_d = aijkok->j_dual.view_device();
1631   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1632   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1633 
1634   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1635   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1636   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1637   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1638 
1639   /* TODO: add options to select sptrsv algorithms */
1640   /* Create sptrsv handles for L, U and their transpose */
1641  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1642   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1643  #else
1644   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1645  #endif
1646 
1647   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1648   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1649   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1650   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1651 
1652   /* Fill fields of the factor matrix B */
1653   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL));
1654   b     = (Mat_SeqAIJ*)B->data;
1655   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1656   B->info.fill_ratio_given  = info->fill;
1657   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1658 
1659   B->offloadmask            = PETSC_OFFLOAD_GPU;
1660   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1661   PetscFunctionReturn(0);
1662 }
1663 
1664 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1665 {
1666   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1667   const PetscInt   nrows   = A->rmap->n;
1668 
1669   PetscFunctionBegin;
1670   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
1671   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1672   // move B data into Kokkos
1673   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
1674   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
1675   {
1676     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1677     if (!baijkok->diag_d.extent(0)) {
1678       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1679       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1680       Kokkos::deep_copy (baijkok->diag_d, h_diag);
1681     }
1682   }
1683   PetscFunctionReturn(0);
1684 }
1685 
1686 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1687 {
1688   PetscFunctionBegin;
1689   *type = MATSOLVERKOKKOS;
1690   PetscFunctionReturn(0);
1691 }
1692 
1693 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1694 {
1695   PetscFunctionBegin;
1696   *type = MATSOLVERKOKKOSDEVICE;
1697   PetscFunctionReturn(0);
1698 }
1699 
1700 /*MC
1701   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1702   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1703 
1704   Level: beginner
1705 
1706 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1707 M*/
1708 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1709 {
1710   PetscInt       n = A->rmap->n;
1711 
1712   PetscFunctionBegin;
1713   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1714   PetscCall(MatSetSizes(*B,n,n,n,n));
1715   (*B)->factortype = ftype;
1716   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1717   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1718 
1719   if (ftype == MAT_FACTOR_LU) {
1720     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1721     (*B)->canuseordering         = PETSC_TRUE;
1722     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1723   } else if (ftype == MAT_FACTOR_ILU) {
1724     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1725     (*B)->canuseordering         = PETSC_FALSE;
1726     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1727   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1728 
1729   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1730   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos));
1731   PetscFunctionReturn(0);
1732 }
1733 
1734 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1735 {
1736   PetscInt       n = A->rmap->n;
1737 
1738   PetscFunctionBegin;
1739   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1740   PetscCall(MatSetSizes(*B,n,n,n,n));
1741   (*B)->factortype = ftype;
1742   (*B)->canuseordering = PETSC_TRUE;
1743   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1744   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1745 
1746   if (ftype == MAT_FACTOR_LU) {
1747     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1748     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1749   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1750 
1751   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1752   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device));
1753   PetscFunctionReturn(0);
1754 }
1755 
1756 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1757 {
1758   PetscFunctionBegin;
1759   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos));
1760   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos));
1761   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device));
1762   PetscFunctionReturn(0);
1763 }
1764 
1765 /* Utility to print out a KokkosCsrMatrix for debugging */
1766 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1767 {
1768   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1769   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1770   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1771   const PetscInt    *i = iv.data();
1772   const PetscInt    *j = jv.data();
1773   const PetscScalar *a = av.data();
1774   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1775 
1776   PetscFunctionBegin;
1777   PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz));
1778   for (PetscInt k=0; k<m; k++) {
1779     PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k));
1780     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1781       PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]));
1782     }
1783     PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n"));
1784   }
1785   PetscFunctionReturn(0);
1786 }
1787