xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision c66e0907e1a0ab9d8ee6ee3f7f3f2f207451f213)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosKernels_Sorting.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 
18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   Mat_SeqAIJ       *aijseq;
29   Mat_SeqAIJKokkos *aijkok;
30 
31   PetscFunctionBegin;
32   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
33   PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
34 
35   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
36   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
37 
38   /* If aijkok does not exist, we just copy i, j to device.
39      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
40      In both cases, we build a new aijkok structure.
41   */
42   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
43     delete aijkok;
44     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
45     A->spptr = aijkok;
46   }
47 
48   if (aijkok->device_mat_d.data()) {
49     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
50   }
51   PetscFunctionReturn(0);
52 }
53 
54 /* Sync CSR data to device if not yet */
55 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
56 {
57   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
58 
59   PetscFunctionBegin;
60   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
61   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
62   if (aijkok->a_dual.need_sync_device()) {
63     aijkok->a_dual.sync_device();
64     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
65     aijkok->hermitian_updated = PETSC_FALSE;
66   }
67   PetscFunctionReturn(0);
68 }
69 
70 /* Mark the CSR data on device as modified */
71 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
72 {
73   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
74 
75   PetscFunctionBegin;
76   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
77   aijkok->a_dual.clear_sync_state();
78   aijkok->a_dual.modify_device();
79   aijkok->transpose_updated = PETSC_FALSE;
80   aijkok->hermitian_updated = PETSC_FALSE;
81   PetscCall(MatSeqAIJInvalidateDiagonal(A));
82   PetscCall(PetscObjectStateIncrease((PetscObject)A));
83   PetscFunctionReturn(0);
84 }
85 
86 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
87 {
88   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
89 
90   PetscFunctionBegin;
91   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
92   /* We do not expect one needs factors on host  */
93   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
94   PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
95   aijkok->a_dual.sync_host();
96   PetscFunctionReturn(0);
97 }
98 
99 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
100 {
101   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
102 
103   PetscFunctionBegin;
104   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
105     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
106     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
107     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
108   */
109   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
110     aijkok->a_dual.sync_host();
111     *array = aijkok->a_dual.view_host().data();
112   } else { /* Happens when calling MatSetValues on a newly created matrix */
113     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
114   }
115   PetscFunctionReturn(0);
116 }
117 
118 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
119 {
120   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
121 
122   PetscFunctionBegin;
123   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
124   PetscFunctionReturn(0);
125 }
126 
127 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
128 {
129   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
130 
131   PetscFunctionBegin;
132   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
133     aijkok->a_dual.sync_host();
134     *array = aijkok->a_dual.view_host().data();
135   } else {
136     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
137   }
138   PetscFunctionReturn(0);
139 }
140 
141 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
142 {
143   PetscFunctionBegin;
144   *array = NULL;
145   PetscFunctionReturn(0);
146 }
147 
148 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
149 {
150   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
151 
152   PetscFunctionBegin;
153   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
154     *array = aijkok->a_dual.view_host().data();
155   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
156     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
157   }
158   PetscFunctionReturn(0);
159 }
160 
161 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
162 {
163   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
164 
165   PetscFunctionBegin;
166   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
167     aijkok->a_dual.clear_sync_state();
168     aijkok->a_dual.modify_host();
169   }
170   PetscFunctionReturn(0);
171 }
172 
173 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
174 {
175   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
176 
177   PetscFunctionBegin;
178   PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL");
179 
180   if (i) *i = aijkok->i_device_data();
181   if (j) *j = aijkok->j_device_data();
182   if (a) {
183     aijkok->a_dual.sync_device();
184     *a = aijkok->a_device_data();
185   }
186   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
187   PetscFunctionReturn(0);
188 }
189 
190 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
191 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
192 {
193   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
194   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
195 
196   PetscFunctionBegin;
197   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
198   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
199   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
200   PetscFunctionReturn(0);
201 }
202 
203 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
204 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
205 {
206   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
207 
208   PetscFunctionBegin;
209   if (aijkok && aijkok->device_mat_d.data()) {
210     *d_mat = aijkok->device_mat_d.data();
211   } else {
212     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
213     *d_mat  = NULL;
214   }
215   PetscFunctionReturn(0);
216 }
217 
218 /* Generate the transpose on device and cache it internally */
219 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
220 {
221   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
222 
223   PetscFunctionBegin;
224   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
225   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
226     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
227     PetscCallCXX(aijkok->a_dual.sync_device());
228     PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
229     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
230     aijkok->transpose_updated = PETSC_TRUE;
231   }
232   *csrmatT = &aijkok->csrmatT;
233   PetscFunctionReturn(0);
234 }
235 
236 /* Generate the Hermitian on device and cache it internally */
237 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
238 {
239   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
240 
241   PetscFunctionBegin;
242   PetscCall(PetscLogGpuTimeBegin());
243   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
244   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
245     PetscCallCXX(aijkok->a_dual.sync_device());
246     PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
247     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
248    #if defined(PETSC_USE_COMPLEX)
249     const auto& a = aijkok->csrmatH.values;
250     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
251    #endif
252     aijkok->hermitian_updated = PETSC_TRUE;
253   }
254   *csrmatH = &aijkok->csrmatH;
255   PetscCall(PetscLogGpuTimeEnd());
256   PetscFunctionReturn(0);
257 }
258 
259 /* y = A x */
260 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
261 {
262   Mat_SeqAIJKokkos                 *aijkok;
263   ConstPetscScalarKokkosView       xv;
264   PetscScalarKokkosView            yv;
265 
266   PetscFunctionBegin;
267   PetscCall(PetscLogGpuTimeBegin());
268   PetscCall(MatSeqAIJKokkosSyncDevice(A));
269   PetscCall(VecGetKokkosView(xx,&xv));
270   PetscCall(VecGetKokkosViewWrite(yy,&yv));
271   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
272   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
273   PetscCall(VecRestoreKokkosView(xx,&xv));
274   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
275   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
276   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
277   PetscCall(PetscLogGpuTimeEnd());
278   PetscFunctionReturn(0);
279 }
280 
281 /* y = A^T x */
282 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
283 {
284   Mat_SeqAIJKokkos                 *aijkok;
285   const char                       *mode;
286   ConstPetscScalarKokkosView       xv;
287   PetscScalarKokkosView            yv;
288   KokkosCsrMatrix                  *csrmat;
289 
290   PetscFunctionBegin;
291   PetscCall(PetscLogGpuTimeBegin());
292   PetscCall(MatSeqAIJKokkosSyncDevice(A));
293   PetscCall(VecGetKokkosView(xx,&xv));
294   PetscCall(VecGetKokkosViewWrite(yy,&yv));
295   if (A->form_explicit_transpose) {
296     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
297     mode = "N";
298   } else {
299     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
300     csrmat = &aijkok->csrmat;
301     mode = "T";
302   }
303   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
304   PetscCall(VecRestoreKokkosView(xx,&xv));
305   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
306   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
307   PetscCall(PetscLogGpuTimeEnd());
308   PetscFunctionReturn(0);
309 }
310 
311 /* y = A^H x */
312 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
313 {
314   Mat_SeqAIJKokkos                 *aijkok;
315   const char                       *mode;
316   ConstPetscScalarKokkosView       xv;
317   PetscScalarKokkosView            yv;
318   KokkosCsrMatrix                  *csrmat;
319 
320   PetscFunctionBegin;
321   PetscCall(PetscLogGpuTimeBegin());
322   PetscCall(MatSeqAIJKokkosSyncDevice(A));
323   PetscCall(VecGetKokkosView(xx,&xv));
324   PetscCall(VecGetKokkosViewWrite(yy,&yv));
325   if (A->form_explicit_transpose) {
326     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
327     mode = "N";
328   } else {
329     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
330     csrmat = &aijkok->csrmat;
331     mode = "C";
332   }
333   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
334   PetscCall(VecRestoreKokkosView(xx,&xv));
335   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
336   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
337   PetscCall(PetscLogGpuTimeEnd());
338   PetscFunctionReturn(0);
339 }
340 
341 /* z = A x + y */
342 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
343 {
344   Mat_SeqAIJKokkos                 *aijkok;
345   ConstPetscScalarKokkosView       xv,yv;
346   PetscScalarKokkosView            zv;
347 
348   PetscFunctionBegin;
349   PetscCall(PetscLogGpuTimeBegin());
350   PetscCall(MatSeqAIJKokkosSyncDevice(A));
351   PetscCall(VecGetKokkosView(xx,&xv));
352   PetscCall(VecGetKokkosView(yy,&yv));
353   PetscCall(VecGetKokkosViewWrite(zz,&zv));
354   if (zz != yy) Kokkos::deep_copy(zv,yv);
355   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
356   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
357   PetscCall(VecRestoreKokkosView(xx,&xv));
358   PetscCall(VecRestoreKokkosView(yy,&yv));
359   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
360   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
361   PetscCall(PetscLogGpuTimeEnd());
362   PetscFunctionReturn(0);
363 }
364 
365 /* z = A^T x + y */
366 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
367 {
368   Mat_SeqAIJKokkos                 *aijkok;
369   const char                       *mode;
370   ConstPetscScalarKokkosView       xv,yv;
371   PetscScalarKokkosView            zv;
372   KokkosCsrMatrix                  *csrmat;
373 
374   PetscFunctionBegin;
375   PetscCall(PetscLogGpuTimeBegin());
376   PetscCall(MatSeqAIJKokkosSyncDevice(A));
377   PetscCall(VecGetKokkosView(xx,&xv));
378   PetscCall(VecGetKokkosView(yy,&yv));
379   PetscCall(VecGetKokkosViewWrite(zz,&zv));
380   if (zz != yy) Kokkos::deep_copy(zv,yv);
381   if (A->form_explicit_transpose) {
382     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
383     mode = "N";
384   } else {
385     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
386     csrmat = &aijkok->csrmat;
387     mode = "T";
388   }
389   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
390   PetscCall(VecRestoreKokkosView(xx,&xv));
391   PetscCall(VecRestoreKokkosView(yy,&yv));
392   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
393   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
394   PetscCall(PetscLogGpuTimeEnd());
395   PetscFunctionReturn(0);
396 }
397 
398 /* z = A^H x + y */
399 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
400 {
401   Mat_SeqAIJKokkos                 *aijkok;
402   const char                       *mode;
403   ConstPetscScalarKokkosView       xv,yv;
404   PetscScalarKokkosView            zv;
405   KokkosCsrMatrix                  *csrmat;
406 
407   PetscFunctionBegin;
408   PetscCall(PetscLogGpuTimeBegin());
409   PetscCall(MatSeqAIJKokkosSyncDevice(A));
410   PetscCall(VecGetKokkosView(xx,&xv));
411   PetscCall(VecGetKokkosView(yy,&yv));
412   PetscCall(VecGetKokkosViewWrite(zz,&zv));
413   if (zz != yy) Kokkos::deep_copy(zv,yv);
414   if (A->form_explicit_transpose) {
415     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
416     mode = "N";
417   } else {
418     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
419     csrmat = &aijkok->csrmat;
420     mode = "C";
421   }
422   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
423   PetscCall(VecRestoreKokkosView(xx,&xv));
424   PetscCall(VecRestoreKokkosView(yy,&yv));
425   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
426   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
427   PetscCall(PetscLogGpuTimeEnd());
428   PetscFunctionReturn(0);
429 }
430 
431 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
432 {
433   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
434 
435   PetscFunctionBegin;
436   switch (op) {
437     case MAT_FORM_EXPLICIT_TRANSPOSE:
438       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
439       if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
440       A->form_explicit_transpose = flg;
441       break;
442     default:
443       PetscCall(MatSetOption_SeqAIJ(A,op,flg));
444       break;
445   }
446   PetscFunctionReturn(0);
447 }
448 
449 /* Depending on reuse, either build a new mat, or use the existing mat */
450 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
451 {
452   Mat_SeqAIJ       *aseq;
453 
454   PetscFunctionBegin;
455   PetscCall(PetscKokkosInitializeCheck());
456   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
457     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */
458   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
459     PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
460   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
461     PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
462     PetscCall(PetscFree(A->defaultvectype));
463     PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */
464     PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS));
465     PetscCall(MatSetOps_SeqAIJKokkos(A));
466     aseq = static_cast<Mat_SeqAIJ*>(A->data);
467     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
468       PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
469       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
470     }
471   }
472   PetscFunctionReturn(0);
473 }
474 
475 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
476    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
477  */
478 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
479 {
480   Mat_SeqAIJ            *bseq;
481   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
482   Mat                   mat;
483 
484   PetscFunctionBegin;
485   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
486   PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B));
487   mat  = *B;
488   if (A->assembled) {
489     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
490     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
491     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
492     /* Now copy values to B if needed */
493     if (dupOption == MAT_COPY_VALUES) {
494       if (akok->a_dual.need_sync_device()) {
495         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
496         bkok->a_dual.modify_host();
497       } else { /* If device has the latest data, we only copy data on device */
498         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
499         bkok->a_dual.modify_device();
500       }
501     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
502       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
503       bkok->a_dual.modify_host();
504     }
505     mat->spptr = bkok;
506   }
507 
508   PetscCall(PetscFree(mat->defaultvectype));
509   PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */
510   PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS));
511   PetscCall(MatSetOps_SeqAIJKokkos(mat));
512   PetscFunctionReturn(0);
513 }
514 
515 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
516 {
517   Mat               At;
518   KokkosCsrMatrix   *internT;
519   Mat_SeqAIJKokkos  *atkok,*bkok;
520 
521   PetscFunctionBegin;
522   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A,*B));
523   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */
524   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
525     /* Deep copy internT, as we want to isolate the internal transpose */
526     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
527     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At));
528     if (reuse == MAT_INITIAL_MATRIX) *B = At;
529     else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */
530   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
531     if ((*B)->assembled) {
532       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
533       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
534       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
535     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
536       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
537       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
538       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
539       PetscCallCXX(Kokkos::deep_copy(a_h,internT->values));
540       PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
541     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
542   }
543   PetscFunctionReturn(0);
544 }
545 
546 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
547 {
548   Mat_SeqAIJKokkos           *aijkok;
549 
550   PetscFunctionBegin;
551   if (A->factortype == MAT_FACTOR_NONE) {
552     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
553     delete aijkok;
554   } else {
555     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
556   }
557   A->spptr = NULL;
558   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
559   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
560   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
561   PetscCall(MatDestroy_SeqAIJ(A));
562   PetscFunctionReturn(0);
563 }
564 
565 /*MC
566    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
567 
568    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
569 
570    Options Database Keys:
571 .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
572 
573   Level: beginner
574 
575 .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
576 M*/
577 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
578 {
579   PetscFunctionBegin;
580   PetscCall(PetscKokkosInitializeCheck());
581   PetscCall(MatCreate_SeqAIJ(A));
582   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A));
583   PetscFunctionReturn(0);
584 }
585 
586 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
587 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
588 {
589   Mat_SeqAIJ                   *a,*b;
590   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
591   MatScalarKokkosView          aa,ba,ca;
592   MatRowMapKokkosView          ai,bi,ci;
593   MatColIdxKokkosView          aj,bj,cj;
594   PetscInt                     m,n,nnz,aN;
595 
596   PetscFunctionBegin;
597   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
598   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
599   PetscValidPointer(C,4);
600   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
601   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
602   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
603   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
604 
605   PetscCall(MatSeqAIJKokkosSyncDevice(A));
606   PetscCall(MatSeqAIJKokkosSyncDevice(B));
607   a    = static_cast<Mat_SeqAIJ*>(A->data);
608   b    = static_cast<Mat_SeqAIJ*>(B->data);
609   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
610   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
611   aa   = akok->a_dual.view_device();
612   ai   = akok->i_dual.view_device();
613   ba   = bkok->a_dual.view_device();
614   bi   = bkok->i_dual.view_device();
615   m    = A->rmap->n; /* M, N and nnz of C */
616   n    = A->cmap->n + B->cmap->n;
617   nnz  = a->nz + b->nz;
618   aN   = A->cmap->n; /* N of A */
619   if (reuse == MAT_INITIAL_MATRIX) {
620     aj = akok->j_dual.view_device();
621     bj = bkok->j_dual.view_device();
622     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
623     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
624     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
625     ca = ca_dual.view_device();
626     ci = ci_dual.view_device();
627     cj = cj_dual.view_device();
628 
629     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
630     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
631       PetscInt i = t.league_rank(); /* row i */
632       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
633 
634       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
635         ci(i) = coffset;
636         if (i == m-1) ci(m) = ai(m) + bi(m);
637       });
638 
639       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
640         if (k < alen) {
641           ca(coffset+k) = aa(ai(i)+k);
642           cj(coffset+k) = aj(ai(i)+k);
643         } else {
644           ca(coffset+k) = ba(bi(i)+k-alen);
645           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
646         }
647       });
648     });
649     ca_dual.modify_device();
650     ci_dual.modify_device();
651     cj_dual.modify_device();
652     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
653     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C));
654   } else if (reuse == MAT_REUSE_MATRIX) {
655     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
656     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
657     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
658     ca   = ckok->a_dual.view_device();
659     ci   = ckok->i_dual.view_device();
660 
661     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
662       PetscInt i = t.league_rank(); /* row i */
663       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
664       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
665         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
666         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
667       });
668     });
669     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
670   }
671   PetscFunctionReturn(0);
672 }
673 
674 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
675 {
676   PetscFunctionBegin;
677   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
678   PetscFunctionReturn(0);
679 }
680 
681 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
682 {
683   Mat_Product                    *product = C->product;
684   Mat                            A,B;
685   bool                           transA,transB; /* use bool, since KK needs this type */
686   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
687   Mat_SeqAIJ                     *c;
688   MatProductData_SeqAIJKokkos    *pdata;
689   KokkosCsrMatrix                *csrmatA,*csrmatB;
690 
691   PetscFunctionBegin;
692   MatCheckProduct(C,1);
693   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
694   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
695 
696   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
697     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
698     PetscFunctionReturn(0);
699   }
700 
701   switch (product->type) {
702     case MATPRODUCT_AB:  transA = false; transB = false; break;
703     case MATPRODUCT_AtB: transA = true;  transB = false; break;
704     case MATPRODUCT_ABt: transA = false; transB = true;  break;
705     default:
706       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
707   }
708 
709   A     = product->A;
710   B     = product->B;
711   PetscCall(MatSeqAIJKokkosSyncDevice(A));
712   PetscCall(MatSeqAIJKokkosSyncDevice(B));
713   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
714   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
715   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
716 
717   PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
718 
719   csrmatA = &akok->csrmat;
720   csrmatB = &bkok->csrmat;
721 
722   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
723   if (transA) {
724     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
725     transA = false;
726   }
727 
728   if (transB) {
729     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
730     transB = false;
731   }
732   PetscCall(PetscLogGpuTimeBegin());
733   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
734   PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
735   PetscCall(PetscLogGpuTimeEnd());
736   PetscCall(MatSeqAIJKokkosModifyDevice(C));
737   /* shorter version of MatAssemblyEnd_SeqAIJ */
738   c = (Mat_SeqAIJ*)C->data;
739   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
740   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
741   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
742   c->reallocs         = 0;
743   C->info.mallocs     = 0;
744   C->info.nz_unneeded = 0;
745   C->assembled        = C->was_assembled = PETSC_TRUE;
746   C->num_ass++;
747   PetscFunctionReturn(0);
748 }
749 
750 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
751 {
752   Mat_Product                    *product = C->product;
753   MatProductType                 ptype;
754   Mat                            A,B;
755   bool                           transA,transB;
756   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
757   MatProductData_SeqAIJKokkos    *pdata;
758   MPI_Comm                       comm;
759   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
760 
761   PetscFunctionBegin;
762   MatCheckProduct(C,1);
763   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
764   PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
765   A       = product->A;
766   B       = product->B;
767   PetscCall(MatSeqAIJKokkosSyncDevice(A));
768   PetscCall(MatSeqAIJKokkosSyncDevice(B));
769   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
770   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
771   csrmatA = &akok->csrmat;
772   csrmatB = &bkok->csrmat;
773 
774   ptype   = product->type;
775   switch (ptype) {
776     case MATPRODUCT_AB:  transA = false; transB = false; break;
777     case MATPRODUCT_AtB: transA = true;  transB = false; break;
778     case MATPRODUCT_ABt: transA = false; transB = true;  break;
779     default:
780       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
781   }
782 
783   product->data = pdata = new MatProductData_SeqAIJKokkos();
784   pdata->kh.set_team_work_size(16);
785   pdata->kh.set_dynamic_scheduling(true);
786   pdata->reusesym = product->api_user;
787 
788   /* TODO: add command line options to select spgemm algorithms */
789   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
790   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
791      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
792    */
793   pdata->kh.create_spgemm_handle(spgemm_alg);
794 
795   PetscCall(PetscLogGpuTimeBegin());
796   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
797   if (transA) {
798     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
799     transA = false;
800   }
801 
802   if (transB) {
803     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
804     transB = false;
805   }
806 
807   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
808   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
809     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
810     calling new Mat_SeqAIJKokkos().
811     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
812   */
813   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
814   PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC));
815   PetscCall(PetscLogGpuTimeEnd());
816 
817   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
818   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok));
819   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
820   PetscFunctionReturn(0);
821 }
822 
823 /* handles sparse matrix matrix ops */
824 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
825 {
826   Mat_Product    *product = mat->product;
827   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
828 
829   PetscFunctionBegin;
830   MatCheckProduct(mat,1);
831   PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok));
832   if (product->type == MATPRODUCT_ABC) {
833     PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok));
834   }
835   if (Biskok && Ciskok) {
836     switch (product->type) {
837     case MATPRODUCT_AB:
838     case MATPRODUCT_AtB:
839     case MATPRODUCT_ABt:
840       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
841       break;
842     case MATPRODUCT_PtAP:
843     case MATPRODUCT_RARt:
844     case MATPRODUCT_ABC:
845       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
846       break;
847     default:
848       break;
849     }
850   } else { /* fallback for AIJ */
851     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
852   }
853   PetscFunctionReturn(0);
854 }
855 
856 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
857 {
858   Mat_SeqAIJKokkos *aijkok;
859 
860   PetscFunctionBegin;
861   PetscCall(PetscLogGpuTimeBegin());
862   PetscCall(MatSeqAIJKokkosSyncDevice(A));
863   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
864   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
865   PetscCall(MatSeqAIJKokkosModifyDevice(A));
866   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
867   PetscCall(PetscLogGpuTimeEnd());
868   PetscFunctionReturn(0);
869 }
870 
871 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
872 {
873   Mat_SeqAIJKokkos *aijkok;
874 
875   PetscFunctionBegin;
876   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
877   if (aijkok) { /* Only zero the device if data is already there */
878     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
879     PetscCall(MatSeqAIJKokkosModifyDevice(A));
880   } else { /* Might be preallocated but not assembled */
881     PetscCall(MatZeroEntries_SeqAIJ(A));
882   }
883   PetscFunctionReturn(0);
884 }
885 
886 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A,Vec x)
887 {
888   Mat_SeqAIJ                   *aijseq;
889   Mat_SeqAIJKokkos             *aijkok;
890   PetscInt                     n;
891   PetscScalarKokkosView        xv;
892 
893   PetscFunctionBegin;
894   PetscCall(VecGetLocalSize(x,&n));
895   PetscCheck(n == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
896   PetscCheck(A->factortype == MAT_FACTOR_NONE,PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
897 
898   PetscCall(MatSeqAIJKokkosSyncDevice(A));
899   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
900 
901   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
902     PetscCall(MatMarkDiagonal_SeqAIJ(A));
903     aijseq = static_cast<Mat_SeqAIJ*>(A->data);
904     aijkok->SetDiagonal(aijseq->diag);
905   }
906 
907   const auto& Aa = aijkok->a_dual.view_device();
908   const auto& Ai = aijkok->i_dual.view_device();
909   const auto& Adiag = aijkok->diag_dual.view_device();
910 
911   PetscCall(VecGetKokkosViewWrite(x,&xv));
912   Kokkos::parallel_for(n,KOKKOS_LAMBDA(const PetscInt i) {
913     if (Adiag(i) < Ai(i+1)) xv(i) = Aa(Adiag(i));
914     else xv(i) = 0;
915   });
916   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
917   PetscFunctionReturn(0);
918 }
919 
920 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
921 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
922 {
923   Mat_SeqAIJKokkos   *aijkok;
924 
925   PetscFunctionBegin;
926   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
927   PetscValidPointer(kv,2);
928   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
929   PetscCall(MatSeqAIJKokkosSyncDevice(A));
930   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
931   *kv    = aijkok->a_dual.view_device();
932   PetscFunctionReturn(0);
933 }
934 
935 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
936 {
937   PetscFunctionBegin;
938   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
939   PetscValidPointer(kv,2);
940   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
941   PetscFunctionReturn(0);
942 }
943 
944 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
945 {
946   Mat_SeqAIJKokkos   *aijkok;
947 
948   PetscFunctionBegin;
949   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
950   PetscValidPointer(kv,2);
951   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
952   PetscCall(MatSeqAIJKokkosSyncDevice(A));
953   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
954   *kv    = aijkok->a_dual.view_device();
955   PetscFunctionReturn(0);
956 }
957 
958 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
959 {
960   PetscFunctionBegin;
961   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
962   PetscValidPointer(kv,2);
963   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
964   PetscCall(MatSeqAIJKokkosModifyDevice(A));
965   PetscFunctionReturn(0);
966 }
967 
968 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
969 {
970   Mat_SeqAIJKokkos   *aijkok;
971 
972   PetscFunctionBegin;
973   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
974   PetscValidPointer(kv,2);
975   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
976   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
977   *kv    = aijkok->a_dual.view_device();
978   PetscFunctionReturn(0);
979 }
980 
981 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
982 {
983   PetscFunctionBegin;
984   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
985   PetscValidPointer(kv,2);
986   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
987   PetscCall(MatSeqAIJKokkosModifyDevice(A));
988   PetscFunctionReturn(0);
989 }
990 
991 /* Computes Y += alpha X */
992 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
993 {
994   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
995   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
996   ConstMatScalarKokkosView   Xa;
997   MatScalarKokkosView        Ya;
998 
999   PetscFunctionBegin;
1000   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
1001   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
1002   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1003   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1004   PetscCall(PetscLogGpuTimeBegin());
1005 
1006   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1007     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
1008     PetscBool e;
1009     PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e));
1010     if (e) {
1011       PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e));
1012       if (e) pattern = SAME_NONZERO_PATTERN;
1013     }
1014   }
1015 
1016   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1017     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1018     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1019     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1020   */
1021   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
1022   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
1023   Xa   = xkok->a_dual.view_device();
1024   Ya   = ykok->a_dual.view_device();
1025 
1026   if (pattern == SAME_NONZERO_PATTERN) {
1027     KokkosBlas::axpy(alpha,Xa,Ya);
1028     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1029   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1030     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
1031     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
1032 
1033     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
1034       PetscInt i = t.league_rank(); /* row i */
1035       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
1036         PetscInt p,q = Yi(i);
1037         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1038           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1039           if (Xj(p) == Yj(q)) { /* Found it */
1040             Ya(q) += alpha * Xa(p);
1041             q++;
1042           } else {
1043             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1044                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1045             */
1046             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) =
1047            #if PETSC_PKG_KOKKOS_VERSION_GE(3,6,99)
1048               Kokkos::nan("1"); /* auto promote the double NaN if needed */
1049            #else
1050               Kokkos::Experimental::nan("1");
1051            #endif
1052           }
1053         }
1054       });
1055     });
1056     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1057   } else { /* different nonzero patterns */
1058     Mat             Z;
1059     KokkosCsrMatrix zcsr;
1060     KernelHandle    kh;
1061     kh.create_spadd_handle(false);
1062     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1063     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1064     zkok = new Mat_SeqAIJKokkos(zcsr);
1065     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z));
1066     PetscCall(MatHeaderReplace(Y,&Z));
1067     kh.destroy_spadd_handle();
1068   }
1069   PetscCall(PetscLogGpuTimeEnd());
1070   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */
1071   PetscFunctionReturn(0);
1072 }
1073 
1074 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1075 {
1076   Mat_SeqAIJKokkos *akok;
1077   Mat_SeqAIJ       *aseq;
1078 
1079   PetscFunctionBegin;
1080   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
1081   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
1082   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1083   delete akok;
1084   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE);
1085   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1086   akok->SetUpCOO(aseq);
1087   PetscFunctionReturn(0);
1088 }
1089 
1090 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1091 {
1092   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1093   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1094   PetscCount                  Annz = aseq->nz;
1095   const PetscCountKokkosView& jmap = akok->jmap_d;
1096   const PetscCountKokkosView& perm = akok->perm_d;
1097   MatScalarKokkosView         Aa;
1098   ConstMatScalarKokkosView    kv;
1099   PetscMemType                memtype;
1100 
1101   PetscFunctionBegin;
1102   PetscCall(PetscGetMemType(v,&memtype));
1103   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1104     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
1105   } else {
1106     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
1107   }
1108 
1109   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */
1110   else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */
1111 
1112   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {
1113     PetscScalar sum = 0.0;
1114     for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k));
1115     Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum;
1116   });
1117 
1118   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
1119   else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa));
1120   PetscFunctionReturn(0);
1121 }
1122 
1123 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag)
1124 {
1125   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1126   MatScalarKokkosView         Aa;
1127   const MatRowMapKokkosView&  Ai = akok->i_dual.view_device();
1128   PetscInt                    m = A->rmap->n;
1129   ConstMatRowMapKokkosView    Adiag(diag,m); /* diag is a device pointer */
1130 
1131   PetscFunctionBegin;
1132   PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa));
1133   Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) {
1134     PetscScalar tmp;
1135     if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */
1136       tmp          = Aa(Ai(i));
1137       Aa(Ai(i))    = Aa(Adiag(i));
1138       Aa(Adiag(i)) = tmp;
1139     }
1140   });
1141   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
1142   PetscFunctionReturn(0);
1143 }
1144 
1145 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1146 {
1147   PetscFunctionBegin;
1148   PetscCall(MatSeqAIJKokkosSyncHost(A));
1149   PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
1150   B->offloadmask = PETSC_OFFLOAD_CPU;
1151   PetscFunctionReturn(0);
1152 }
1153 
1154 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1155 {
1156   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1157 
1158   PetscFunctionBegin;
1159   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1160   A->boundtocpu  = PETSC_FALSE;
1161 
1162   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1163   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1164   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1165   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1166   A->ops->scale                     = MatScale_SeqAIJKokkos;
1167   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1168   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1169   A->ops->mult                      = MatMult_SeqAIJKokkos;
1170   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1171   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1172   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1173   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1174   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1175   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1176   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1177   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1178   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1179   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1180   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1181   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1182   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1183   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1184   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1185   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1186 
1187   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos));
1188   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos));
1189   PetscFunctionReturn(0);
1190 }
1191 
1192 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1193 {
1194   Mat_SeqAIJ *aseq;
1195   PetscInt    i,m,n;
1196 
1197   PetscFunctionBegin;
1198   PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1199 
1200   m = akok->nrows();
1201   n = akok->ncols();
1202   PetscCall(MatSetSizes(A,m,n,m,n));
1203   PetscCall(MatSetType(A,MATSEQAIJKOKKOS));
1204 
1205   /* Set up data structures of A as a MATSEQAIJ */
1206   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL));
1207   aseq = (Mat_SeqAIJ*)(A)->data;
1208 
1209   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1210   akok->j_dual.sync_host();
1211 
1212   aseq->i            = akok->i_host_data();
1213   aseq->j            = akok->j_host_data();
1214   aseq->a            = akok->a_host_data();
1215   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1216   aseq->singlemalloc = PETSC_FALSE;
1217   aseq->free_a       = PETSC_FALSE;
1218   aseq->free_ij      = PETSC_FALSE;
1219   aseq->nz           = akok->nnz();
1220   aseq->maxnz        = aseq->nz;
1221 
1222   PetscCall(PetscMalloc1(m,&aseq->imax));
1223   PetscCall(PetscMalloc1(m,&aseq->ilen));
1224   for (i=0; i<m; i++) {
1225     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1226   }
1227 
1228   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1229   akok->nonzerostate = A->nonzerostate;
1230   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1231   PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
1232   PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
1233   PetscFunctionReturn(0);
1234 }
1235 
1236 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1237 
1238    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1239  */
1240 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1241 {
1242   PetscFunctionBegin;
1243   PetscCall(MatCreate(comm,A));
1244   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok));
1245   PetscFunctionReturn(0);
1246 }
1247 
1248 /* --------------------------------------------------------------------------------*/
1249 /*@C
1250    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1251    (the default parallel PETSc format). This matrix will ultimately be handled by
1252    Kokkos for calculations. For good matrix
1253    assembly performance the user should preallocate the matrix storage by setting
1254    the parameter nz (or the array nnz).  By setting these parameters accurately,
1255    performance during matrix assembly can be increased by more than a factor of 50.
1256 
1257    Collective
1258 
1259    Input Parameters:
1260 +  comm - MPI communicator, set to PETSC_COMM_SELF
1261 .  m - number of rows
1262 .  n - number of columns
1263 .  nz - number of nonzeros per row (same for all rows)
1264 -  nnz - array containing the number of nonzeros in the various rows
1265          (possibly different for each row) or NULL
1266 
1267    Output Parameter:
1268 .  A - the matrix
1269 
1270    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1271    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1272    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1273 
1274    Notes:
1275    If nnz is given then nz is ignored
1276 
1277    The AIJ format (also called the Yale sparse matrix format or
1278    compressed row storage), is fully compatible with standard Fortran 77
1279    storage.  That is, the stored row and column indices can begin at
1280    either one (as in Fortran) or zero.  See the users' manual for details.
1281 
1282    Specify the preallocated storage with either nz or nnz (not both).
1283    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1284    allocation.  For large problems you MUST preallocate memory or you
1285    will get TERRIBLE performance, see the users' manual chapter on matrices.
1286 
1287    By default, this format uses inodes (identical nodes) when possible, to
1288    improve numerical efficiency of matrix-vector products and solves. We
1289    search for consecutive rows with the same nonzero structure, thereby
1290    reusing matrix information to achieve increased efficiency.
1291 
1292    Level: intermediate
1293 
1294 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
1295 @*/
1296 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1297 {
1298   PetscFunctionBegin;
1299   PetscCall(PetscKokkosInitializeCheck());
1300   PetscCall(MatCreate(comm,A));
1301   PetscCall(MatSetSizes(*A,m,n,m,n));
1302   PetscCall(MatSetType(*A,MATSEQAIJKOKKOS));
1303   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
1304   PetscFunctionReturn(0);
1305 }
1306 
1307 typedef Kokkos::TeamPolicy<>::member_type team_member;
1308 //
1309 // This factorization exploits block diagonal matrices with "Nf" (not used).
1310 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1311 //
1312 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1313 {
1314   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1315   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1316   IS                 isrow = b->row,isicol = b->icol;
1317   const PetscInt     *r_h,*ic_h;
1318   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1319   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1320   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1321   PetscBool          row_identity,col_identity;
1322   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1323 
1324   PetscFunctionBegin;
1325   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1326   PetscCall(MatIsStructurallySymmetric(A,&row_identity));
1327   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1328   PetscCall(ISGetIndices(isrow,&r_h));
1329   PetscCall(ISGetIndices(isicol,&ic_h));
1330   PetscCall(ISGetSize(isicol,&nc));
1331   PetscCall(PetscLogGpuTimeBegin());
1332   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1333   {
1334 #define KOKKOS_SHARED_LEVEL 1
1335     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1336     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1337     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1338     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1339     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1340     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1341     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1342     size_t flops_h = 0.0;
1343     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1344     Kokkos::View<size_t> d_flops_k ("flops");
1345     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1346     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1347     Kokkos::deep_copy (d_flops_k, h_flops_k);
1348     Kokkos::deep_copy (d_r_k, h_r_k);
1349     Kokkos::deep_copy (d_ic_k, h_ic_k);
1350     // Fill A --> fact
1351     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1352         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1353         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1354         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1355         // zero rows of B
1356         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1357             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1358             PetscScalar *baL = ba_d + bi_d[rowb];
1359             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1360             /* zero (unfactored row) */
1361             for (int j=0;j<nzbL;j++) baL[j] = 0;
1362             for (int j=0;j<nzbU;j++) baU[j] = 0;
1363           });
1364         // copy A into B
1365         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1366             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1367             const PetscScalar *av    = aa_d + ai_d[rowa];
1368             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1369             /* load in initial (unfactored row) */
1370             for (int j=0;j<nza;j++) {
1371               PetscInt    colb = ic[ajtmp[j]];
1372               PetscScalar vala = av[j];
1373               if (colb == rowb) {
1374                 *(ba_d + bdiag_d[rowb]) = vala;
1375               } else {
1376                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1377                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1378                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1379                 for (int j=0; j<nz ; j++) {
1380                   if (pbj[j] == colb) {
1381                     pba[j] = vala;
1382                     set++;
1383                     break;
1384                   }
1385                 }
1386                #if !defined(PETSC_HAVE_SYCL)
1387                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1388                #endif
1389               }
1390             }
1391           });
1392       });
1393     Kokkos::fence();
1394 
1395     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1396         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1397         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1398         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1399         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1400         const PetscInt  start = field*nloc, end = start + nloc;
1401         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1402         // A22 panel update for each row A(1,:) and col A(:,1)
1403         for (int ii=start; ii<end-1; ii++) {
1404           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1405           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1406           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1407           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1408           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1409               PetscInt kIdx = j*Ni + field_block_idx;
1410               if (kIdx >= nzUi) /* void */ ;
1411               else {
1412                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1413                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1414                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1415                 size_t         st_idx;
1416                 // find and do L(k,i) = A(:k,i) / A(i,i)
1417                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1418                 // get column, there has got to be a better way
1419                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1420                     if (pjL[j] == ii) {
1421                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1422                       idx = j; // output
1423                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1424                     }
1425                 }, st_idx);
1426                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1427 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1428                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1429 #endif
1430                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1431                 // U(i+1,:end)
1432                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1433                       PetscScalar Uij = baUi[uiIdx];
1434                       PetscInt    col = bjUi[uiIdx];
1435                       if (col==myk) {
1436                         // A_kk = A_kk - L_ki * U_ij(k)
1437                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1438                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1439                       } else {
1440                         PetscScalar    *start, *end, *pAkjv=NULL;
1441                         PetscInt       high, low;
1442                         const PetscInt *startj;
1443                         if (col<myk) { // L
1444                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1445                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1446                           start = pLki+1; // start at pLki+1, A22(myk,1)
1447                           startj= bj_d + bi_d[myk] + idx;
1448                           end   = ba_d + bi_d[myk+1];
1449                         } else {
1450                           PetscInt idx = bdiag_d[myk+1]+1;
1451                           start = ba_d + idx;
1452                           startj= bj_d + idx;
1453                           end   = ba_d + bdiag_d[myk];
1454                         }
1455                         // search for 'col', use bisection search - TODO
1456                         low  = 0;
1457                         high = (PetscInt)(end-start);
1458                         while (high-low > 5) {
1459                           int t = (low+high)/2;
1460                           if (startj[t] > col) high = t;
1461                           else                 low  = t;
1462                         }
1463                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1464                           if (startj[pAkjv-start] == col) break;
1465                         }
1466 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1467                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1468 #endif
1469                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1470                       }
1471                     });
1472               }
1473             });
1474           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1475           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1476         } /* endof for (i=0; i<n; i++) { */
1477         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1478       });
1479     Kokkos::fence();
1480     Kokkos::deep_copy (h_flops_k, d_flops_k);
1481     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
1482     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1483         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1484         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1485         /* Invert diagonal for simpler triangular solves */
1486         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1487             int i = start + outer_index*Ni + lg_rank%Ni;
1488             if (i < end) {
1489               PetscScalar *pv = ba_d + bdiag_d[i];
1490               *pv = 1.0/(*pv);
1491             }
1492           });
1493       });
1494   }
1495   PetscCall(PetscLogGpuTimeEnd());
1496   PetscCall(ISRestoreIndices(isicol,&ic_h));
1497   PetscCall(ISRestoreIndices(isrow,&r_h));
1498 
1499   PetscCall(ISIdentity(isrow,&row_identity));
1500   PetscCall(ISIdentity(isicol,&col_identity));
1501   if (b->inode.size) {
1502     B->ops->solve = MatSolve_SeqAIJ_Inode;
1503   } else if (row_identity && col_identity) {
1504     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1505   } else {
1506     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1507   }
1508   B->offloadmask = PETSC_OFFLOAD_GPU;
1509   PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU
1510   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1511   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1512   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1513   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1514   B->assembled              = PETSC_TRUE;
1515   B->preallocated           = PETSC_TRUE;
1516 
1517   PetscFunctionReturn(0);
1518 }
1519 
1520 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1521 {
1522   PetscFunctionBegin;
1523   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
1524   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1525   PetscFunctionReturn(0);
1526 }
1527 
1528 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1529 {
1530   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1531 
1532   PetscFunctionBegin;
1533   if (!factors->sptrsv_symbolic_completed) {
1534     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1535     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1536     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1537   }
1538   PetscFunctionReturn(0);
1539 }
1540 
1541 /* Check if we need to update factors etc for transpose solve */
1542 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1543 {
1544   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1545   MatColIdxType             n = A->rmap->n;
1546 
1547   PetscFunctionBegin;
1548   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1549     /* Update L^T and do sptrsv symbolic */
1550     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1551     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1552     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1553     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1554 
1555     KokkosKernels::Impl::transpose_matrix<
1556       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1557       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1558       MatRowMapKokkosView,DefaultExecutionSpace>(
1559         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1560         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1561 
1562     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1563       We have to sort the indices, until KK provides finer control options.
1564     */
1565     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1566       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1567         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1568 
1569     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1570 
1571     /* Update U^T and do sptrsv symbolic */
1572     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1573     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1574     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1575     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1576 
1577     KokkosKernels::Impl::transpose_matrix<
1578       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1579       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1580       MatRowMapKokkosView,DefaultExecutionSpace>(
1581         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1582         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1583 
1584     /* Sort indices. See comments above */
1585     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1586       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1587         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1588 
1589     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1590     factors->transpose_updated = PETSC_TRUE;
1591   }
1592   PetscFunctionReturn(0);
1593 }
1594 
1595 /* Solve Ax = b, with A = LU */
1596 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1597 {
1598   ConstPetscScalarKokkosView     bv;
1599   PetscScalarKokkosView          xv;
1600   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1601 
1602   PetscFunctionBegin;
1603   PetscCall(PetscLogGpuTimeBegin());
1604   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1605   PetscCall(VecGetKokkosView(b,&bv));
1606   PetscCall(VecGetKokkosViewWrite(x,&xv));
1607   /* Solve L tmpv = b */
1608   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1609   /* Solve Ux = tmpv */
1610   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1611   PetscCall(VecRestoreKokkosView(b,&bv));
1612   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
1613   PetscCall(PetscLogGpuTimeEnd());
1614   PetscFunctionReturn(0);
1615 }
1616 
1617 /* Solve A^T x = b, where A^T = U^T L^T */
1618 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1619 {
1620   ConstPetscScalarKokkosView     bv;
1621   PetscScalarKokkosView          xv;
1622   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1623 
1624   PetscFunctionBegin;
1625   PetscCall(PetscLogGpuTimeBegin());
1626   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1627   PetscCall(VecGetKokkosView(b,&bv));
1628   PetscCall(VecGetKokkosViewWrite(x,&xv));
1629   /* Solve U^T tmpv = b */
1630   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1631 
1632   /* Solve L^T x = tmpv */
1633   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1634   PetscCall(VecRestoreKokkosView(b,&bv));
1635   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
1636   PetscCall(PetscLogGpuTimeEnd());
1637   PetscFunctionReturn(0);
1638 }
1639 
1640 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1641 {
1642   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1643   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1644   PetscInt                       fill_lev = info->levels;
1645 
1646   PetscFunctionBegin;
1647   PetscCall(PetscLogGpuTimeBegin());
1648   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1649 
1650   auto a_d = aijkok->a_dual.view_device();
1651   auto i_d = aijkok->i_dual.view_device();
1652   auto j_d = aijkok->j_dual.view_device();
1653 
1654   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1655 
1656   B->assembled                       = PETSC_TRUE;
1657   B->preallocated                    = PETSC_TRUE;
1658   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1659   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1660   B->ops->matsolve                   = NULL;
1661   B->ops->matsolvetranspose          = NULL;
1662   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1663 
1664   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1665   factors->transpose_updated         = PETSC_FALSE;
1666   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1667   /* TODO: log flops, but how to know that? */
1668   PetscCall(PetscLogGpuTimeEnd());
1669   PetscFunctionReturn(0);
1670 }
1671 
1672 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1673 {
1674   Mat_SeqAIJKokkos               *aijkok;
1675   Mat_SeqAIJ                     *b;
1676   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1677   PetscInt                       fill_lev = info->levels;
1678   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1679   PetscInt                       n = A->rmap->n;
1680 
1681   PetscFunctionBegin;
1682   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1683   /* Rebuild factors */
1684   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1685   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1686 
1687   /* Create a spiluk handle and then do symbolic factorization */
1688   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1689   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1690 
1691   auto spiluk_handle = factors->kh.get_spiluk_handle();
1692 
1693   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1694   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1695   Kokkos::realloc(factors->iU_d,n+1);
1696   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1697 
1698   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1699   auto i_d = aijkok->i_dual.view_device();
1700   auto j_d = aijkok->j_dual.view_device();
1701   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1702   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1703 
1704   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1705   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1706   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1707   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1708 
1709   /* TODO: add options to select sptrsv algorithms */
1710   /* Create sptrsv handles for L, U and their transpose */
1711  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1712   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1713  #else
1714   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1715  #endif
1716 
1717   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1718   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1719   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1720   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1721 
1722   /* Fill fields of the factor matrix B */
1723   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL));
1724   b     = (Mat_SeqAIJ*)B->data;
1725   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1726   B->info.fill_ratio_given  = info->fill;
1727   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1728 
1729   B->offloadmask            = PETSC_OFFLOAD_GPU;
1730   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1731   PetscFunctionReturn(0);
1732 }
1733 
1734 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1735 {
1736   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1737   const PetscInt   nrows   = A->rmap->n;
1738 
1739   PetscFunctionBegin;
1740   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
1741   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1742   // move B data into Kokkos
1743   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
1744   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
1745   {
1746     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1747     if (!baijkok->diag_d.extent(0)) {
1748       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1749       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1750       Kokkos::deep_copy (baijkok->diag_d, h_diag);
1751     }
1752   }
1753   PetscFunctionReturn(0);
1754 }
1755 
1756 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1757 {
1758   PetscFunctionBegin;
1759   *type = MATSOLVERKOKKOS;
1760   PetscFunctionReturn(0);
1761 }
1762 
1763 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1764 {
1765   PetscFunctionBegin;
1766   *type = MATSOLVERKOKKOSDEVICE;
1767   PetscFunctionReturn(0);
1768 }
1769 
1770 /*MC
1771   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1772   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1773 
1774   Level: beginner
1775 
1776 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1777 M*/
1778 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1779 {
1780   PetscInt       n = A->rmap->n;
1781 
1782   PetscFunctionBegin;
1783   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1784   PetscCall(MatSetSizes(*B,n,n,n,n));
1785   (*B)->factortype = ftype;
1786   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1787   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1788 
1789   if (ftype == MAT_FACTOR_LU) {
1790     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1791     (*B)->canuseordering         = PETSC_TRUE;
1792     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1793   } else if (ftype == MAT_FACTOR_ILU) {
1794     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1795     (*B)->canuseordering         = PETSC_FALSE;
1796     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1797   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1798 
1799   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1800   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos));
1801   PetscFunctionReturn(0);
1802 }
1803 
1804 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1805 {
1806   PetscInt       n = A->rmap->n;
1807 
1808   PetscFunctionBegin;
1809   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1810   PetscCall(MatSetSizes(*B,n,n,n,n));
1811   (*B)->factortype = ftype;
1812   (*B)->canuseordering = PETSC_TRUE;
1813   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1814   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1815 
1816   if (ftype == MAT_FACTOR_LU) {
1817     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1818     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1819   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1820 
1821   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1822   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device));
1823   PetscFunctionReturn(0);
1824 }
1825 
1826 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1827 {
1828   PetscFunctionBegin;
1829   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos));
1830   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos));
1831   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device));
1832   PetscFunctionReturn(0);
1833 }
1834 
1835 /* Utility to print out a KokkosCsrMatrix for debugging */
1836 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1837 {
1838   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1839   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1840   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1841   const PetscInt    *i = iv.data();
1842   const PetscInt    *j = jv.data();
1843   const PetscScalar *a = av.data();
1844   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1845 
1846   PetscFunctionBegin;
1847   PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz));
1848   for (PetscInt k=0; k<m; k++) {
1849     PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k));
1850     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1851       PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],(double)PetscRealPart(a[p])));
1852     }
1853     PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n"));
1854   }
1855   PetscFunctionReturn(0);
1856 }
1857