xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 76be6f4ff3bd4e251c19fc00ebbebfd58b6e7589)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosKernels_Sorting.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 
18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   Mat_SeqAIJ       *aijseq;
29   Mat_SeqAIJKokkos *aijkok;
30 
31   PetscFunctionBegin;
32   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
33   PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
34 
35   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
36   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
37 
38   /* If aijkok does not exist, we just copy i, j to device.
39      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
40      In both cases, we build a new aijkok structure.
41   */
42   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
43     delete aijkok;
44     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
45     A->spptr = aijkok;
46   }
47 
48   if (aijkok->device_mat_d.data()) {
49     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
50   }
51   PetscFunctionReturn(0);
52 }
53 
54 /* Sync CSR data to device if not yet */
55 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
56 {
57   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
58 
59   PetscFunctionBegin;
60   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
61   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
62   if (aijkok->a_dual.need_sync_device()) {
63     aijkok->a_dual.sync_device();
64     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
65     aijkok->hermitian_updated = PETSC_FALSE;
66   }
67   PetscFunctionReturn(0);
68 }
69 
70 /* Mark the CSR data on device as modified */
71 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
72 {
73   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
74 
75   PetscFunctionBegin;
76   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
77   aijkok->a_dual.clear_sync_state();
78   aijkok->a_dual.modify_device();
79   aijkok->transpose_updated = PETSC_FALSE;
80   aijkok->hermitian_updated = PETSC_FALSE;
81   PetscCall(MatSeqAIJInvalidateDiagonal(A));
82   PetscCall(PetscObjectStateIncrease((PetscObject)A));
83   PetscFunctionReturn(0);
84 }
85 
86 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
87 {
88   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
89 
90   PetscFunctionBegin;
91   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
92   /* We do not expect one needs factors on host  */
93   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
94   PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
95   aijkok->a_dual.sync_host();
96   PetscFunctionReturn(0);
97 }
98 
99 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
100 {
101   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
102 
103   PetscFunctionBegin;
104   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
105     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
106     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
107     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
108   */
109   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
110     aijkok->a_dual.sync_host();
111     *array = aijkok->a_dual.view_host().data();
112   } else { /* Happens when calling MatSetValues on a newly created matrix */
113     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
114   }
115   PetscFunctionReturn(0);
116 }
117 
118 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
119 {
120   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
121 
122   PetscFunctionBegin;
123   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
124   PetscFunctionReturn(0);
125 }
126 
127 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
128 {
129   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
130 
131   PetscFunctionBegin;
132   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
133     aijkok->a_dual.sync_host();
134     *array = aijkok->a_dual.view_host().data();
135   } else {
136     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
137   }
138   PetscFunctionReturn(0);
139 }
140 
141 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
142 {
143   PetscFunctionBegin;
144   *array = NULL;
145   PetscFunctionReturn(0);
146 }
147 
148 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
149 {
150   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
151 
152   PetscFunctionBegin;
153   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
154     *array = aijkok->a_dual.view_host().data();
155   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
156     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
157   }
158   PetscFunctionReturn(0);
159 }
160 
161 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
162 {
163   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
164 
165   PetscFunctionBegin;
166   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
167     aijkok->a_dual.clear_sync_state();
168     aijkok->a_dual.modify_host();
169   }
170   PetscFunctionReturn(0);
171 }
172 
173 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
174 {
175   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
176 
177   PetscFunctionBegin;
178   PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL");
179 
180   if (i) *i = aijkok->i_device_data();
181   if (j) *j = aijkok->j_device_data();
182   if (a) {
183     aijkok->a_dual.sync_device();
184     *a = aijkok->a_device_data();
185   }
186   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
187   PetscFunctionReturn(0);
188 }
189 
190 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
191 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
192 {
193   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
194   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
195 
196   PetscFunctionBegin;
197   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
198   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
199   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
200   PetscFunctionReturn(0);
201 }
202 
203 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
204 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
205 {
206   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
207 
208   PetscFunctionBegin;
209   if (aijkok && aijkok->device_mat_d.data()) {
210     *d_mat = aijkok->device_mat_d.data();
211   } else {
212     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
213     *d_mat  = NULL;
214   }
215   PetscFunctionReturn(0);
216 }
217 
218 /* Generate the transpose on device and cache it internally */
219 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
220 {
221   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
222 
223   PetscFunctionBegin;
224   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
225   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
226     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
227     PetscCallCXX(aijkok->a_dual.sync_device());
228     PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
229     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
230     aijkok->transpose_updated = PETSC_TRUE;
231   }
232   *csrmatT = &aijkok->csrmatT;
233   PetscFunctionReturn(0);
234 }
235 
236 /* Generate the Hermitian on device and cache it internally */
237 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
238 {
239   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
240 
241   PetscFunctionBegin;
242   PetscCall(PetscLogGpuTimeBegin());
243   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
244   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
245     PetscCallCXX(aijkok->a_dual.sync_device());
246     PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
247     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
248    #if defined(PETSC_USE_COMPLEX)
249     const auto& a = aijkok->csrmatH.values;
250     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
251    #endif
252     aijkok->hermitian_updated = PETSC_TRUE;
253   }
254   *csrmatH = &aijkok->csrmatH;
255   PetscCall(PetscLogGpuTimeEnd());
256   PetscFunctionReturn(0);
257 }
258 
259 /* y = A x */
260 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
261 {
262   Mat_SeqAIJKokkos                 *aijkok;
263   ConstPetscScalarKokkosView       xv;
264   PetscScalarKokkosView            yv;
265 
266   PetscFunctionBegin;
267   PetscCall(PetscLogGpuTimeBegin());
268   PetscCall(MatSeqAIJKokkosSyncDevice(A));
269   PetscCall(VecGetKokkosView(xx,&xv));
270   PetscCall(VecGetKokkosViewWrite(yy,&yv));
271   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
272   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
273   PetscCall(VecRestoreKokkosView(xx,&xv));
274   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
275   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
276   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
277   PetscCall(PetscLogGpuTimeEnd());
278   PetscFunctionReturn(0);
279 }
280 
281 /* y = A^T x */
282 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
283 {
284   Mat_SeqAIJKokkos                 *aijkok;
285   const char                       *mode;
286   ConstPetscScalarKokkosView       xv;
287   PetscScalarKokkosView            yv;
288   KokkosCsrMatrix                  *csrmat;
289 
290   PetscFunctionBegin;
291   PetscCall(PetscLogGpuTimeBegin());
292   PetscCall(MatSeqAIJKokkosSyncDevice(A));
293   PetscCall(VecGetKokkosView(xx,&xv));
294   PetscCall(VecGetKokkosViewWrite(yy,&yv));
295   if (A->form_explicit_transpose) {
296     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
297     mode = "N";
298   } else {
299     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
300     csrmat = &aijkok->csrmat;
301     mode = "T";
302   }
303   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
304   PetscCall(VecRestoreKokkosView(xx,&xv));
305   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
306   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
307   PetscCall(PetscLogGpuTimeEnd());
308   PetscFunctionReturn(0);
309 }
310 
311 /* y = A^H x */
312 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
313 {
314   Mat_SeqAIJKokkos                 *aijkok;
315   const char                       *mode;
316   ConstPetscScalarKokkosView       xv;
317   PetscScalarKokkosView            yv;
318   KokkosCsrMatrix                  *csrmat;
319 
320   PetscFunctionBegin;
321   PetscCall(PetscLogGpuTimeBegin());
322   PetscCall(MatSeqAIJKokkosSyncDevice(A));
323   PetscCall(VecGetKokkosView(xx,&xv));
324   PetscCall(VecGetKokkosViewWrite(yy,&yv));
325   if (A->form_explicit_transpose) {
326     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
327     mode = "N";
328   } else {
329     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
330     csrmat = &aijkok->csrmat;
331     mode = "C";
332   }
333   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
334   PetscCall(VecRestoreKokkosView(xx,&xv));
335   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
336   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
337   PetscCall(PetscLogGpuTimeEnd());
338   PetscFunctionReturn(0);
339 }
340 
341 /* z = A x + y */
342 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
343 {
344   Mat_SeqAIJKokkos                 *aijkok;
345   ConstPetscScalarKokkosView       xv,yv;
346   PetscScalarKokkosView            zv;
347 
348   PetscFunctionBegin;
349   PetscCall(PetscLogGpuTimeBegin());
350   PetscCall(MatSeqAIJKokkosSyncDevice(A));
351   PetscCall(VecGetKokkosView(xx,&xv));
352   PetscCall(VecGetKokkosView(yy,&yv));
353   PetscCall(VecGetKokkosViewWrite(zz,&zv));
354   if (zz != yy) Kokkos::deep_copy(zv,yv);
355   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
356   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
357   PetscCall(VecRestoreKokkosView(xx,&xv));
358   PetscCall(VecRestoreKokkosView(yy,&yv));
359   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
360   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
361   PetscCall(PetscLogGpuTimeEnd());
362   PetscFunctionReturn(0);
363 }
364 
365 /* z = A^T x + y */
366 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
367 {
368   Mat_SeqAIJKokkos                 *aijkok;
369   const char                       *mode;
370   ConstPetscScalarKokkosView       xv,yv;
371   PetscScalarKokkosView            zv;
372   KokkosCsrMatrix                  *csrmat;
373 
374   PetscFunctionBegin;
375   PetscCall(PetscLogGpuTimeBegin());
376   PetscCall(MatSeqAIJKokkosSyncDevice(A));
377   PetscCall(VecGetKokkosView(xx,&xv));
378   PetscCall(VecGetKokkosView(yy,&yv));
379   PetscCall(VecGetKokkosViewWrite(zz,&zv));
380   if (zz != yy) Kokkos::deep_copy(zv,yv);
381   if (A->form_explicit_transpose) {
382     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
383     mode = "N";
384   } else {
385     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
386     csrmat = &aijkok->csrmat;
387     mode = "T";
388   }
389   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
390   PetscCall(VecRestoreKokkosView(xx,&xv));
391   PetscCall(VecRestoreKokkosView(yy,&yv));
392   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
393   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
394   PetscCall(PetscLogGpuTimeEnd());
395   PetscFunctionReturn(0);
396 }
397 
398 /* z = A^H x + y */
399 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
400 {
401   Mat_SeqAIJKokkos                 *aijkok;
402   const char                       *mode;
403   ConstPetscScalarKokkosView       xv,yv;
404   PetscScalarKokkosView            zv;
405   KokkosCsrMatrix                  *csrmat;
406 
407   PetscFunctionBegin;
408   PetscCall(PetscLogGpuTimeBegin());
409   PetscCall(MatSeqAIJKokkosSyncDevice(A));
410   PetscCall(VecGetKokkosView(xx,&xv));
411   PetscCall(VecGetKokkosView(yy,&yv));
412   PetscCall(VecGetKokkosViewWrite(zz,&zv));
413   if (zz != yy) Kokkos::deep_copy(zv,yv);
414   if (A->form_explicit_transpose) {
415     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
416     mode = "N";
417   } else {
418     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
419     csrmat = &aijkok->csrmat;
420     mode = "C";
421   }
422   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
423   PetscCall(VecRestoreKokkosView(xx,&xv));
424   PetscCall(VecRestoreKokkosView(yy,&yv));
425   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
426   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
427   PetscCall(PetscLogGpuTimeEnd());
428   PetscFunctionReturn(0);
429 }
430 
431 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
432 {
433   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
434 
435   PetscFunctionBegin;
436   switch (op) {
437     case MAT_FORM_EXPLICIT_TRANSPOSE:
438       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
439       if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
440       A->form_explicit_transpose = flg;
441       break;
442     default:
443       PetscCall(MatSetOption_SeqAIJ(A,op,flg));
444       break;
445   }
446   PetscFunctionReturn(0);
447 }
448 
449 /* Depending on reuse, either build a new mat, or use the existing mat */
450 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
451 {
452   Mat_SeqAIJ       *aseq;
453 
454   PetscFunctionBegin;
455   PetscCall(PetscKokkosInitializeCheck());
456   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
457     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */
458   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
459     PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
460   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
461     PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
462     PetscCall(PetscFree(A->defaultvectype));
463     PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */
464     PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS));
465     PetscCall(MatSetOps_SeqAIJKokkos(A));
466     aseq = static_cast<Mat_SeqAIJ*>(A->data);
467     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
468       PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
469       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
470     }
471   }
472   PetscFunctionReturn(0);
473 }
474 
475 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
476    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
477  */
478 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
479 {
480   Mat_SeqAIJ            *bseq;
481   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
482   Mat                   mat;
483 
484   PetscFunctionBegin;
485   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
486   PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B));
487   mat  = *B;
488   if (A->assembled) {
489     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
490     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
491     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
492     /* Now copy values to B if needed */
493     if (dupOption == MAT_COPY_VALUES) {
494       if (akok->a_dual.need_sync_device()) {
495         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
496         bkok->a_dual.modify_host();
497       } else { /* If device has the latest data, we only copy data on device */
498         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
499         bkok->a_dual.modify_device();
500       }
501     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
502       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
503       bkok->a_dual.modify_host();
504     }
505     mat->spptr = bkok;
506   }
507 
508   PetscCall(PetscFree(mat->defaultvectype));
509   PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */
510   PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS));
511   PetscCall(MatSetOps_SeqAIJKokkos(mat));
512   PetscFunctionReturn(0);
513 }
514 
515 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
516 {
517   Mat               At;
518   KokkosCsrMatrix   *internT;
519   Mat_SeqAIJKokkos  *atkok,*bkok;
520 
521   PetscFunctionBegin;
522   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */
523   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
524     /* Deep copy internT, as we want to isolate the internal transpose */
525     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
526     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At));
527     if (reuse == MAT_INITIAL_MATRIX) *B = At;
528     else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */
529   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
530     if ((*B)->assembled) {
531       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
532       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
533       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
534     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
535       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
536       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
537       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
538       PetscCallCXX(Kokkos::deep_copy(a_h,internT->values));
539       PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
540     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
541   }
542   PetscFunctionReturn(0);
543 }
544 
545 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
546 {
547   Mat_SeqAIJKokkos           *aijkok;
548 
549   PetscFunctionBegin;
550   if (A->factortype == MAT_FACTOR_NONE) {
551     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
552     delete aijkok;
553   } else {
554     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
555   }
556   A->spptr = NULL;
557   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
558   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
559   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
560   PetscCall(MatDestroy_SeqAIJ(A));
561   PetscFunctionReturn(0);
562 }
563 
564 /*MC
565    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
566 
567    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
568 
569    Options Database Keys:
570 .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
571 
572   Level: beginner
573 
574 .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
575 M*/
576 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
577 {
578   PetscFunctionBegin;
579   PetscCall(PetscKokkosInitializeCheck());
580   PetscCall(MatCreate_SeqAIJ(A));
581   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A));
582   PetscFunctionReturn(0);
583 }
584 
585 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
586 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
587 {
588   Mat_SeqAIJ                   *a,*b;
589   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
590   MatScalarKokkosView          aa,ba,ca;
591   MatRowMapKokkosView          ai,bi,ci;
592   MatColIdxKokkosView          aj,bj,cj;
593   PetscInt                     m,n,nnz,aN;
594 
595   PetscFunctionBegin;
596   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
597   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
598   PetscValidPointer(C,4);
599   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
600   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
601   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
602   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
603 
604   PetscCall(MatSeqAIJKokkosSyncDevice(A));
605   PetscCall(MatSeqAIJKokkosSyncDevice(B));
606   a    = static_cast<Mat_SeqAIJ*>(A->data);
607   b    = static_cast<Mat_SeqAIJ*>(B->data);
608   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
609   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
610   aa   = akok->a_dual.view_device();
611   ai   = akok->i_dual.view_device();
612   ba   = bkok->a_dual.view_device();
613   bi   = bkok->i_dual.view_device();
614   m    = A->rmap->n; /* M, N and nnz of C */
615   n    = A->cmap->n + B->cmap->n;
616   nnz  = a->nz + b->nz;
617   aN   = A->cmap->n; /* N of A */
618   if (reuse == MAT_INITIAL_MATRIX) {
619     aj = akok->j_dual.view_device();
620     bj = bkok->j_dual.view_device();
621     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
622     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
623     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
624     ca = ca_dual.view_device();
625     ci = ci_dual.view_device();
626     cj = cj_dual.view_device();
627 
628     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
629     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
630       PetscInt i = t.league_rank(); /* row i */
631       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
632 
633       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
634         ci(i) = coffset;
635         if (i == m-1) ci(m) = ai(m) + bi(m);
636       });
637 
638       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
639         if (k < alen) {
640           ca(coffset+k) = aa(ai(i)+k);
641           cj(coffset+k) = aj(ai(i)+k);
642         } else {
643           ca(coffset+k) = ba(bi(i)+k-alen);
644           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
645         }
646       });
647     });
648     ca_dual.modify_device();
649     ci_dual.modify_device();
650     cj_dual.modify_device();
651     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
652     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C));
653   } else if (reuse == MAT_REUSE_MATRIX) {
654     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
655     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
656     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
657     ca   = ckok->a_dual.view_device();
658     ci   = ckok->i_dual.view_device();
659 
660     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
661       PetscInt i = t.league_rank(); /* row i */
662       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
663       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
664         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
665         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
666       });
667     });
668     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
669   }
670   PetscFunctionReturn(0);
671 }
672 
673 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
674 {
675   PetscFunctionBegin;
676   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
677   PetscFunctionReturn(0);
678 }
679 
680 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
681 {
682   Mat_Product                    *product = C->product;
683   Mat                            A,B;
684   bool                           transA,transB; /* use bool, since KK needs this type */
685   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
686   Mat_SeqAIJ                     *c;
687   MatProductData_SeqAIJKokkos    *pdata;
688   KokkosCsrMatrix                *csrmatA,*csrmatB;
689 
690   PetscFunctionBegin;
691   MatCheckProduct(C,1);
692   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
693   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
694 
695   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
696     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
697     PetscFunctionReturn(0);
698   }
699 
700   switch (product->type) {
701     case MATPRODUCT_AB:  transA = false; transB = false; break;
702     case MATPRODUCT_AtB: transA = true;  transB = false; break;
703     case MATPRODUCT_ABt: transA = false; transB = true;  break;
704     default:
705       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
706   }
707 
708   A     = product->A;
709   B     = product->B;
710   PetscCall(MatSeqAIJKokkosSyncDevice(A));
711   PetscCall(MatSeqAIJKokkosSyncDevice(B));
712   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
713   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
714   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
715 
716   PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
717 
718   csrmatA = &akok->csrmat;
719   csrmatB = &bkok->csrmat;
720 
721   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
722   if (transA) {
723     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
724     transA = false;
725   }
726 
727   if (transB) {
728     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
729     transB = false;
730   }
731   PetscCall(PetscLogGpuTimeBegin());
732   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
733   PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
734   PetscCall(PetscLogGpuTimeEnd());
735   PetscCall(MatSeqAIJKokkosModifyDevice(C));
736   /* shorter version of MatAssemblyEnd_SeqAIJ */
737   c = (Mat_SeqAIJ*)C->data;
738   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
739   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
740   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
741   c->reallocs         = 0;
742   C->info.mallocs     = 0;
743   C->info.nz_unneeded = 0;
744   C->assembled        = C->was_assembled = PETSC_TRUE;
745   C->num_ass++;
746   PetscFunctionReturn(0);
747 }
748 
749 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
750 {
751   Mat_Product                    *product = C->product;
752   MatProductType                 ptype;
753   Mat                            A,B;
754   bool                           transA,transB;
755   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
756   MatProductData_SeqAIJKokkos    *pdata;
757   MPI_Comm                       comm;
758   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
759 
760   PetscFunctionBegin;
761   MatCheckProduct(C,1);
762   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
763   PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
764   A       = product->A;
765   B       = product->B;
766   PetscCall(MatSeqAIJKokkosSyncDevice(A));
767   PetscCall(MatSeqAIJKokkosSyncDevice(B));
768   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
769   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
770   csrmatA = &akok->csrmat;
771   csrmatB = &bkok->csrmat;
772 
773   ptype   = product->type;
774   switch (ptype) {
775     case MATPRODUCT_AB:  transA = false; transB = false; break;
776     case MATPRODUCT_AtB: transA = true;  transB = false; break;
777     case MATPRODUCT_ABt: transA = false; transB = true;  break;
778     default:
779       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
780   }
781 
782   product->data = pdata = new MatProductData_SeqAIJKokkos();
783   pdata->kh.set_team_work_size(16);
784   pdata->kh.set_dynamic_scheduling(true);
785   pdata->reusesym = product->api_user;
786 
787   /* TODO: add command line options to select spgemm algorithms */
788   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
789   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
790      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
791    */
792   pdata->kh.create_spgemm_handle(spgemm_alg);
793 
794   PetscCall(PetscLogGpuTimeBegin());
795   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
796   if (transA) {
797     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
798     transA = false;
799   }
800 
801   if (transB) {
802     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
803     transB = false;
804   }
805 
806   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
807   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
808     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
809     calling new Mat_SeqAIJKokkos().
810     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
811   */
812   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
813   PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC));
814   PetscCall(PetscLogGpuTimeEnd());
815 
816   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
817   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok));
818   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
819   PetscFunctionReturn(0);
820 }
821 
822 /* handles sparse matrix matrix ops */
823 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
824 {
825   Mat_Product    *product = mat->product;
826   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
827 
828   PetscFunctionBegin;
829   MatCheckProduct(mat,1);
830   PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok));
831   if (product->type == MATPRODUCT_ABC) {
832     PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok));
833   }
834   if (Biskok && Ciskok) {
835     switch (product->type) {
836     case MATPRODUCT_AB:
837     case MATPRODUCT_AtB:
838     case MATPRODUCT_ABt:
839       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
840       break;
841     case MATPRODUCT_PtAP:
842     case MATPRODUCT_RARt:
843     case MATPRODUCT_ABC:
844       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
845       break;
846     default:
847       break;
848     }
849   } else { /* fallback for AIJ */
850     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
851   }
852   PetscFunctionReturn(0);
853 }
854 
855 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
856 {
857   Mat_SeqAIJKokkos *aijkok;
858 
859   PetscFunctionBegin;
860   PetscCall(PetscLogGpuTimeBegin());
861   PetscCall(MatSeqAIJKokkosSyncDevice(A));
862   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
863   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
864   PetscCall(MatSeqAIJKokkosModifyDevice(A));
865   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
866   PetscCall(PetscLogGpuTimeEnd());
867   PetscFunctionReturn(0);
868 }
869 
870 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
871 {
872   Mat_SeqAIJKokkos *aijkok;
873 
874   PetscFunctionBegin;
875   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
876   if (aijkok) { /* Only zero the device if data is already there */
877     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
878     PetscCall(MatSeqAIJKokkosModifyDevice(A));
879   } else { /* Might be preallocated but not assembled */
880     PetscCall(MatZeroEntries_SeqAIJ(A));
881   }
882   PetscFunctionReturn(0);
883 }
884 
885 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
886 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
887 {
888   Mat_SeqAIJKokkos   *aijkok;
889 
890   PetscFunctionBegin;
891   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
892   PetscValidPointer(kv,2);
893   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
894   PetscCall(MatSeqAIJKokkosSyncDevice(A));
895   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
896   *kv    = aijkok->a_dual.view_device();
897   PetscFunctionReturn(0);
898 }
899 
900 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
901 {
902   PetscFunctionBegin;
903   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
904   PetscValidPointer(kv,2);
905   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
906   PetscFunctionReturn(0);
907 }
908 
909 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
910 {
911   Mat_SeqAIJKokkos   *aijkok;
912 
913   PetscFunctionBegin;
914   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
915   PetscValidPointer(kv,2);
916   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
917   PetscCall(MatSeqAIJKokkosSyncDevice(A));
918   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
919   *kv    = aijkok->a_dual.view_device();
920   PetscFunctionReturn(0);
921 }
922 
923 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
924 {
925   PetscFunctionBegin;
926   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
927   PetscValidPointer(kv,2);
928   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
929   PetscCall(MatSeqAIJKokkosModifyDevice(A));
930   PetscFunctionReturn(0);
931 }
932 
933 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
934 {
935   Mat_SeqAIJKokkos   *aijkok;
936 
937   PetscFunctionBegin;
938   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
939   PetscValidPointer(kv,2);
940   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
941   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
942   *kv    = aijkok->a_dual.view_device();
943   PetscFunctionReturn(0);
944 }
945 
946 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
947 {
948   PetscFunctionBegin;
949   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
950   PetscValidPointer(kv,2);
951   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
952   PetscCall(MatSeqAIJKokkosModifyDevice(A));
953   PetscFunctionReturn(0);
954 }
955 
956 /* Computes Y += alpha X */
957 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
958 {
959   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
960   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
961   ConstMatScalarKokkosView   Xa;
962   MatScalarKokkosView        Ya;
963 
964   PetscFunctionBegin;
965   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
966   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
967   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
968   PetscCall(MatSeqAIJKokkosSyncDevice(X));
969   PetscCall(PetscLogGpuTimeBegin());
970 
971   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
972     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
973     PetscBool e;
974     PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e));
975     if (e) {
976       PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e));
977       if (e) pattern = SAME_NONZERO_PATTERN;
978     }
979   }
980 
981   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
982     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
983     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
984     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
985   */
986   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
987   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
988   Xa   = xkok->a_dual.view_device();
989   Ya   = ykok->a_dual.view_device();
990 
991   if (pattern == SAME_NONZERO_PATTERN) {
992     KokkosBlas::axpy(alpha,Xa,Ya);
993     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
994   } else if (pattern == SUBSET_NONZERO_PATTERN) {
995     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
996     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
997 
998     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
999       PetscInt i = t.league_rank(); /* row i */
1000       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
1001         PetscInt p,q = Yi(i);
1002         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1003           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1004           if (Xj(p) == Yj(q)) { /* Found it */
1005             Ya(q) += alpha * Xa(p);
1006             q++;
1007           } else {
1008             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1009                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1010             */
1011             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) =
1012            #if PETSC_PKG_KOKKOS_VERSION_GE(3,6,99)
1013               Kokkos::nan("1"); /* auto promote the double NaN if needed */
1014            #else
1015               Kokkos::Experimental::nan("1");
1016            #endif
1017           }
1018         }
1019       });
1020     });
1021     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1022   } else { /* different nonzero patterns */
1023     Mat             Z;
1024     KokkosCsrMatrix zcsr;
1025     KernelHandle    kh;
1026     kh.create_spadd_handle(false);
1027     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1028     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1029     zkok = new Mat_SeqAIJKokkos(zcsr);
1030     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z));
1031     PetscCall(MatHeaderReplace(Y,&Z));
1032     kh.destroy_spadd_handle();
1033   }
1034   PetscCall(PetscLogGpuTimeEnd());
1035   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */
1036   PetscFunctionReturn(0);
1037 }
1038 
1039 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
1040 {
1041   Mat_SeqAIJKokkos *akok;
1042   Mat_SeqAIJ       *aseq;
1043 
1044   PetscFunctionBegin;
1045   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
1046   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
1047   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1048   delete akok;
1049   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE);
1050   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1051   akok->SetUpCOO(aseq);
1052   PetscFunctionReturn(0);
1053 }
1054 
1055 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1056 {
1057   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1058   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1059   PetscCount                  Annz = aseq->nz;
1060   const PetscCountKokkosView& jmap = akok->jmap_d;
1061   const PetscCountKokkosView& perm = akok->perm_d;
1062   MatScalarKokkosView         Aa;
1063   ConstMatScalarKokkosView    kv;
1064   PetscMemType                memtype;
1065 
1066   PetscFunctionBegin;
1067   PetscCall(PetscGetMemType(v,&memtype));
1068   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1069     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
1070   } else {
1071     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
1072   }
1073 
1074   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */
1075   else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */
1076 
1077   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {
1078     PetscScalar sum = 0.0;
1079     for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k));
1080     Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum;
1081   });
1082 
1083   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
1084   else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa));
1085   PetscFunctionReturn(0);
1086 }
1087 
1088 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag)
1089 {
1090   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1091   MatScalarKokkosView         Aa;
1092   const MatRowMapKokkosView&  Ai = akok->i_dual.view_device();
1093   PetscInt                    m = A->rmap->n;
1094   ConstMatRowMapKokkosView    Adiag(diag,m); /* diag is a device pointer */
1095 
1096   PetscFunctionBegin;
1097   PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa));
1098   Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) {
1099     PetscScalar tmp;
1100     if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */
1101       tmp          = Aa(Ai(i));
1102       Aa(Ai(i))    = Aa(Adiag(i));
1103       Aa(Adiag(i)) = tmp;
1104     }
1105   });
1106   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
1107   PetscFunctionReturn(0);
1108 }
1109 
1110 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1111 {
1112   PetscFunctionBegin;
1113   PetscCall(MatSeqAIJKokkosSyncHost(A));
1114   PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
1115   B->offloadmask = PETSC_OFFLOAD_CPU;
1116   PetscFunctionReturn(0);
1117 }
1118 
1119 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1120 {
1121   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1122 
1123   PetscFunctionBegin;
1124   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1125   A->boundtocpu  = PETSC_FALSE;
1126 
1127   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1128   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1129   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1130   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1131   A->ops->scale                     = MatScale_SeqAIJKokkos;
1132   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1133   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1134   A->ops->mult                      = MatMult_SeqAIJKokkos;
1135   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1136   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1137   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1138   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1139   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1140   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1141   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1142   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1143   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1144   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1145   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1146   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1147   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1148   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1149   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1150 
1151   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos));
1152   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos));
1153   PetscFunctionReturn(0);
1154 }
1155 
1156 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1157 {
1158   Mat_SeqAIJ *aseq;
1159   PetscInt    i,m,n;
1160 
1161   PetscFunctionBegin;
1162   PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1163 
1164   m = akok->nrows();
1165   n = akok->ncols();
1166   PetscCall(MatSetSizes(A,m,n,m,n));
1167   PetscCall(MatSetType(A,MATSEQAIJKOKKOS));
1168 
1169   /* Set up data structures of A as a MATSEQAIJ */
1170   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL));
1171   aseq = (Mat_SeqAIJ*)(A)->data;
1172 
1173   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1174   akok->j_dual.sync_host();
1175 
1176   aseq->i            = akok->i_host_data();
1177   aseq->j            = akok->j_host_data();
1178   aseq->a            = akok->a_host_data();
1179   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1180   aseq->singlemalloc = PETSC_FALSE;
1181   aseq->free_a       = PETSC_FALSE;
1182   aseq->free_ij      = PETSC_FALSE;
1183   aseq->nz           = akok->nnz();
1184   aseq->maxnz        = aseq->nz;
1185 
1186   PetscCall(PetscMalloc1(m,&aseq->imax));
1187   PetscCall(PetscMalloc1(m,&aseq->ilen));
1188   for (i=0; i<m; i++) {
1189     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1190   }
1191 
1192   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1193   akok->nonzerostate = A->nonzerostate;
1194   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1195   PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
1196   PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
1197   PetscFunctionReturn(0);
1198 }
1199 
1200 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1201 
1202    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1203  */
1204 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1205 {
1206   PetscFunctionBegin;
1207   PetscCall(MatCreate(comm,A));
1208   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok));
1209   PetscFunctionReturn(0);
1210 }
1211 
1212 /* --------------------------------------------------------------------------------*/
1213 /*@C
1214    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1215    (the default parallel PETSc format). This matrix will ultimately be handled by
1216    Kokkos for calculations. For good matrix
1217    assembly performance the user should preallocate the matrix storage by setting
1218    the parameter nz (or the array nnz).  By setting these parameters accurately,
1219    performance during matrix assembly can be increased by more than a factor of 50.
1220 
1221    Collective
1222 
1223    Input Parameters:
1224 +  comm - MPI communicator, set to PETSC_COMM_SELF
1225 .  m - number of rows
1226 .  n - number of columns
1227 .  nz - number of nonzeros per row (same for all rows)
1228 -  nnz - array containing the number of nonzeros in the various rows
1229          (possibly different for each row) or NULL
1230 
1231    Output Parameter:
1232 .  A - the matrix
1233 
1234    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1235    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1236    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1237 
1238    Notes:
1239    If nnz is given then nz is ignored
1240 
1241    The AIJ format (also called the Yale sparse matrix format or
1242    compressed row storage), is fully compatible with standard Fortran 77
1243    storage.  That is, the stored row and column indices can begin at
1244    either one (as in Fortran) or zero.  See the users' manual for details.
1245 
1246    Specify the preallocated storage with either nz or nnz (not both).
1247    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1248    allocation.  For large problems you MUST preallocate memory or you
1249    will get TERRIBLE performance, see the users' manual chapter on matrices.
1250 
1251    By default, this format uses inodes (identical nodes) when possible, to
1252    improve numerical efficiency of matrix-vector products and solves. We
1253    search for consecutive rows with the same nonzero structure, thereby
1254    reusing matrix information to achieve increased efficiency.
1255 
1256    Level: intermediate
1257 
1258 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
1259 @*/
1260 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1261 {
1262   PetscFunctionBegin;
1263   PetscCall(PetscKokkosInitializeCheck());
1264   PetscCall(MatCreate(comm,A));
1265   PetscCall(MatSetSizes(*A,m,n,m,n));
1266   PetscCall(MatSetType(*A,MATSEQAIJKOKKOS));
1267   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
1268   PetscFunctionReturn(0);
1269 }
1270 
1271 typedef Kokkos::TeamPolicy<>::member_type team_member;
1272 //
1273 // This factorization exploits block diagonal matrices with "Nf" (not used).
1274 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1275 //
1276 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1277 {
1278   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1279   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1280   IS                 isrow = b->row,isicol = b->icol;
1281   const PetscInt     *r_h,*ic_h;
1282   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1283   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1284   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1285   PetscBool          row_identity,col_identity;
1286   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1287 
1288   PetscFunctionBegin;
1289   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1290   PetscCall(MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity));
1291   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1292   PetscCall(ISGetIndices(isrow,&r_h));
1293   PetscCall(ISGetIndices(isicol,&ic_h));
1294   PetscCall(ISGetSize(isicol,&nc));
1295   PetscCall(PetscLogGpuTimeBegin());
1296   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1297   {
1298 #define KOKKOS_SHARED_LEVEL 1
1299     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1300     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1301     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1302     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1303     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1304     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1305     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1306     size_t flops_h = 0.0;
1307     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1308     Kokkos::View<size_t> d_flops_k ("flops");
1309     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1310     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1311     Kokkos::deep_copy (d_flops_k, h_flops_k);
1312     Kokkos::deep_copy (d_r_k, h_r_k);
1313     Kokkos::deep_copy (d_ic_k, h_ic_k);
1314     // Fill A --> fact
1315     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1316         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1317         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1318         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1319         // zero rows of B
1320         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1321             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1322             PetscScalar *baL = ba_d + bi_d[rowb];
1323             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1324             /* zero (unfactored row) */
1325             for (int j=0;j<nzbL;j++) baL[j] = 0;
1326             for (int j=0;j<nzbU;j++) baU[j] = 0;
1327           });
1328         // copy A into B
1329         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1330             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1331             const PetscScalar *av    = aa_d + ai_d[rowa];
1332             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1333             /* load in initial (unfactored row) */
1334             for (int j=0;j<nza;j++) {
1335               PetscInt    colb = ic[ajtmp[j]];
1336               PetscScalar vala = av[j];
1337               if (colb == rowb) {
1338                 *(ba_d + bdiag_d[rowb]) = vala;
1339               } else {
1340                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1341                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1342                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1343                 for (int j=0; j<nz ; j++) {
1344                   if (pbj[j] == colb) {
1345                     pba[j] = vala;
1346                     set++;
1347                     break;
1348                   }
1349                 }
1350                #if !defined(PETSC_HAVE_SYCL)
1351                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1352                #endif
1353               }
1354             }
1355           });
1356       });
1357     Kokkos::fence();
1358 
1359     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1360         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1361         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1362         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1363         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1364         const PetscInt  start = field*nloc, end = start + nloc;
1365         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1366         // A22 panel update for each row A(1,:) and col A(:,1)
1367         for (int ii=start; ii<end-1; ii++) {
1368           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1369           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1370           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1371           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1372           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1373               PetscInt kIdx = j*Ni + field_block_idx;
1374               if (kIdx >= nzUi) /* void */ ;
1375               else {
1376                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1377                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1378                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1379                 size_t         st_idx;
1380                 // find and do L(k,i) = A(:k,i) / A(i,i)
1381                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1382                 // get column, there has got to be a better way
1383                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1384                     if (pjL[j] == ii) {
1385                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1386                       idx = j; // output
1387                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1388                     }
1389                 }, st_idx);
1390                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1391 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1392                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1393 #endif
1394                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1395                 // U(i+1,:end)
1396                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1397                       PetscScalar Uij = baUi[uiIdx];
1398                       PetscInt    col = bjUi[uiIdx];
1399                       if (col==myk) {
1400                         // A_kk = A_kk - L_ki * U_ij(k)
1401                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1402                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1403                       } else {
1404                         PetscScalar    *start, *end, *pAkjv=NULL;
1405                         PetscInt       high, low;
1406                         const PetscInt *startj;
1407                         if (col<myk) { // L
1408                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1409                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1410                           start = pLki+1; // start at pLki+1, A22(myk,1)
1411                           startj= bj_d + bi_d[myk] + idx;
1412                           end   = ba_d + bi_d[myk+1];
1413                         } else {
1414                           PetscInt idx = bdiag_d[myk+1]+1;
1415                           start = ba_d + idx;
1416                           startj= bj_d + idx;
1417                           end   = ba_d + bdiag_d[myk];
1418                         }
1419                         // search for 'col', use bisection search - TODO
1420                         low  = 0;
1421                         high = (PetscInt)(end-start);
1422                         while (high-low > 5) {
1423                           int t = (low+high)/2;
1424                           if (startj[t] > col) high = t;
1425                           else                 low  = t;
1426                         }
1427                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1428                           if (startj[pAkjv-start] == col) break;
1429                         }
1430 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1431                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1432 #endif
1433                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1434                       }
1435                     });
1436               }
1437             });
1438           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1439           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1440         } /* endof for (i=0; i<n; i++) { */
1441         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1442       });
1443     Kokkos::fence();
1444     Kokkos::deep_copy (h_flops_k, d_flops_k);
1445     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
1446     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1447         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1448         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1449         /* Invert diagonal for simpler triangular solves */
1450         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1451             int i = start + outer_index*Ni + lg_rank%Ni;
1452             if (i < end) {
1453               PetscScalar *pv = ba_d + bdiag_d[i];
1454               *pv = 1.0/(*pv);
1455             }
1456           });
1457       });
1458   }
1459   PetscCall(PetscLogGpuTimeEnd());
1460   PetscCall(ISRestoreIndices(isicol,&ic_h));
1461   PetscCall(ISRestoreIndices(isrow,&r_h));
1462 
1463   PetscCall(ISIdentity(isrow,&row_identity));
1464   PetscCall(ISIdentity(isicol,&col_identity));
1465   if (b->inode.size) {
1466     B->ops->solve = MatSolve_SeqAIJ_Inode;
1467   } else if (row_identity && col_identity) {
1468     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1469   } else {
1470     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1471   }
1472   B->offloadmask = PETSC_OFFLOAD_GPU;
1473   PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU
1474   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1475   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1476   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1477   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1478   B->assembled              = PETSC_TRUE;
1479   B->preallocated           = PETSC_TRUE;
1480 
1481   PetscFunctionReturn(0);
1482 }
1483 
1484 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1485 {
1486   PetscFunctionBegin;
1487   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
1488   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1489   PetscFunctionReturn(0);
1490 }
1491 
1492 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1493 {
1494   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1495 
1496   PetscFunctionBegin;
1497   if (!factors->sptrsv_symbolic_completed) {
1498     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1499     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1500     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1501   }
1502   PetscFunctionReturn(0);
1503 }
1504 
1505 /* Check if we need to update factors etc for transpose solve */
1506 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1507 {
1508   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1509   MatColIdxType             n = A->rmap->n;
1510 
1511   PetscFunctionBegin;
1512   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1513     /* Update L^T and do sptrsv symbolic */
1514     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1515     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1516     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1517     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1518 
1519     KokkosKernels::Impl::transpose_matrix<
1520       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1521       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1522       MatRowMapKokkosView,DefaultExecutionSpace>(
1523         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1524         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1525 
1526     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1527       We have to sort the indices, until KK provides finer control options.
1528     */
1529     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1530       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1531         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1532 
1533     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1534 
1535     /* Update U^T and do sptrsv symbolic */
1536     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1537     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1538     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1539     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1540 
1541     KokkosKernels::Impl::transpose_matrix<
1542       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1543       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1544       MatRowMapKokkosView,DefaultExecutionSpace>(
1545         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1546         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1547 
1548     /* Sort indices. See comments above */
1549     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1550       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1551         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1552 
1553     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1554     factors->transpose_updated = PETSC_TRUE;
1555   }
1556   PetscFunctionReturn(0);
1557 }
1558 
1559 /* Solve Ax = b, with A = LU */
1560 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1561 {
1562   ConstPetscScalarKokkosView     bv;
1563   PetscScalarKokkosView          xv;
1564   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1565 
1566   PetscFunctionBegin;
1567   PetscCall(PetscLogGpuTimeBegin());
1568   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1569   PetscCall(VecGetKokkosView(b,&bv));
1570   PetscCall(VecGetKokkosViewWrite(x,&xv));
1571   /* Solve L tmpv = b */
1572   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1573   /* Solve Ux = tmpv */
1574   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1575   PetscCall(VecRestoreKokkosView(b,&bv));
1576   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
1577   PetscCall(PetscLogGpuTimeEnd());
1578   PetscFunctionReturn(0);
1579 }
1580 
1581 /* Solve A^T x = b, where A^T = U^T L^T */
1582 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1583 {
1584   ConstPetscScalarKokkosView     bv;
1585   PetscScalarKokkosView          xv;
1586   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1587 
1588   PetscFunctionBegin;
1589   PetscCall(PetscLogGpuTimeBegin());
1590   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1591   PetscCall(VecGetKokkosView(b,&bv));
1592   PetscCall(VecGetKokkosViewWrite(x,&xv));
1593   /* Solve U^T tmpv = b */
1594   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1595 
1596   /* Solve L^T x = tmpv */
1597   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1598   PetscCall(VecRestoreKokkosView(b,&bv));
1599   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
1600   PetscCall(PetscLogGpuTimeEnd());
1601   PetscFunctionReturn(0);
1602 }
1603 
1604 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1605 {
1606   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1607   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1608   PetscInt                       fill_lev = info->levels;
1609 
1610   PetscFunctionBegin;
1611   PetscCall(PetscLogGpuTimeBegin());
1612   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1613 
1614   auto a_d = aijkok->a_dual.view_device();
1615   auto i_d = aijkok->i_dual.view_device();
1616   auto j_d = aijkok->j_dual.view_device();
1617 
1618   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1619 
1620   B->assembled                       = PETSC_TRUE;
1621   B->preallocated                    = PETSC_TRUE;
1622   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1623   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1624   B->ops->matsolve                   = NULL;
1625   B->ops->matsolvetranspose          = NULL;
1626   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1627 
1628   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1629   factors->transpose_updated         = PETSC_FALSE;
1630   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1631   /* TODO: log flops, but how to know that? */
1632   PetscCall(PetscLogGpuTimeEnd());
1633   PetscFunctionReturn(0);
1634 }
1635 
1636 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1637 {
1638   Mat_SeqAIJKokkos               *aijkok;
1639   Mat_SeqAIJ                     *b;
1640   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1641   PetscInt                       fill_lev = info->levels;
1642   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1643   PetscInt                       n = A->rmap->n;
1644 
1645   PetscFunctionBegin;
1646   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1647   /* Rebuild factors */
1648   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1649   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1650 
1651   /* Create a spiluk handle and then do symbolic factorization */
1652   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1653   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1654 
1655   auto spiluk_handle = factors->kh.get_spiluk_handle();
1656 
1657   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1658   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1659   Kokkos::realloc(factors->iU_d,n+1);
1660   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1661 
1662   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1663   auto i_d = aijkok->i_dual.view_device();
1664   auto j_d = aijkok->j_dual.view_device();
1665   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1666   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1667 
1668   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1669   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1670   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1671   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1672 
1673   /* TODO: add options to select sptrsv algorithms */
1674   /* Create sptrsv handles for L, U and their transpose */
1675  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1676   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1677  #else
1678   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1679  #endif
1680 
1681   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1682   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1683   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1684   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1685 
1686   /* Fill fields of the factor matrix B */
1687   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL));
1688   b     = (Mat_SeqAIJ*)B->data;
1689   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1690   B->info.fill_ratio_given  = info->fill;
1691   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1692 
1693   B->offloadmask            = PETSC_OFFLOAD_GPU;
1694   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1695   PetscFunctionReturn(0);
1696 }
1697 
1698 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1699 {
1700   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1701   const PetscInt   nrows   = A->rmap->n;
1702 
1703   PetscFunctionBegin;
1704   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
1705   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1706   // move B data into Kokkos
1707   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
1708   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
1709   {
1710     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1711     if (!baijkok->diag_d.extent(0)) {
1712       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1713       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1714       Kokkos::deep_copy (baijkok->diag_d, h_diag);
1715     }
1716   }
1717   PetscFunctionReturn(0);
1718 }
1719 
1720 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1721 {
1722   PetscFunctionBegin;
1723   *type = MATSOLVERKOKKOS;
1724   PetscFunctionReturn(0);
1725 }
1726 
1727 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1728 {
1729   PetscFunctionBegin;
1730   *type = MATSOLVERKOKKOSDEVICE;
1731   PetscFunctionReturn(0);
1732 }
1733 
1734 /*MC
1735   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1736   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1737 
1738   Level: beginner
1739 
1740 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1741 M*/
1742 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1743 {
1744   PetscInt       n = A->rmap->n;
1745 
1746   PetscFunctionBegin;
1747   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1748   PetscCall(MatSetSizes(*B,n,n,n,n));
1749   (*B)->factortype = ftype;
1750   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1751   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1752 
1753   if (ftype == MAT_FACTOR_LU) {
1754     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1755     (*B)->canuseordering         = PETSC_TRUE;
1756     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1757   } else if (ftype == MAT_FACTOR_ILU) {
1758     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1759     (*B)->canuseordering         = PETSC_FALSE;
1760     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1761   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1762 
1763   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1764   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos));
1765   PetscFunctionReturn(0);
1766 }
1767 
1768 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1769 {
1770   PetscInt       n = A->rmap->n;
1771 
1772   PetscFunctionBegin;
1773   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1774   PetscCall(MatSetSizes(*B,n,n,n,n));
1775   (*B)->factortype = ftype;
1776   (*B)->canuseordering = PETSC_TRUE;
1777   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1778   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1779 
1780   if (ftype == MAT_FACTOR_LU) {
1781     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1782     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1783   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1784 
1785   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1786   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device));
1787   PetscFunctionReturn(0);
1788 }
1789 
1790 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1791 {
1792   PetscFunctionBegin;
1793   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos));
1794   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos));
1795   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device));
1796   PetscFunctionReturn(0);
1797 }
1798 
1799 /* Utility to print out a KokkosCsrMatrix for debugging */
1800 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1801 {
1802   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1803   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1804   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1805   const PetscInt    *i = iv.data();
1806   const PetscInt    *j = jv.data();
1807   const PetscScalar *a = av.data();
1808   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1809 
1810   PetscFunctionBegin;
1811   PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz));
1812   for (PetscInt k=0; k<m; k++) {
1813     PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k));
1814     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1815       PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],(double)PetscRealPart(a[p])));
1816     }
1817     PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n"));
1818   }
1819   PetscFunctionReturn(0);
1820 }
1821