xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 2e16c0ce58b3a4ec287cbc0a0807bfb0a0fa5ac9)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosKernels_Sorting.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 
18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   Mat_SeqAIJ       *aijseq;
29   Mat_SeqAIJKokkos *aijkok;
30 
31   PetscFunctionBegin;
32   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
33   PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
34 
35   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
36   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
37 
38   /* If aijkok does not exist, we just copy i, j to device.
39      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
40      In both cases, we build a new aijkok structure.
41   */
42   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
43     delete aijkok;
44     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
45     A->spptr = aijkok;
46   }
47 
48   if (aijkok->device_mat_d.data()) {
49     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
50   }
51   PetscFunctionReturn(0);
52 }
53 
54 /* Sync CSR data to device if not yet */
55 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
56 {
57   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
58 
59   PetscFunctionBegin;
60   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
61   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
62   if (aijkok->a_dual.need_sync_device()) {
63     aijkok->a_dual.sync_device();
64     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
65     aijkok->hermitian_updated = PETSC_FALSE;
66   }
67   PetscFunctionReturn(0);
68 }
69 
70 /* Mark the CSR data on device as modified */
71 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
72 {
73   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
74 
75   PetscFunctionBegin;
76   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
77   aijkok->a_dual.clear_sync_state();
78   aijkok->a_dual.modify_device();
79   aijkok->transpose_updated = PETSC_FALSE;
80   aijkok->hermitian_updated = PETSC_FALSE;
81   PetscCall(MatSeqAIJInvalidateDiagonal(A));
82   PetscCall(PetscObjectStateIncrease((PetscObject)A));
83   PetscFunctionReturn(0);
84 }
85 
86 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
87 {
88   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
89 
90   PetscFunctionBegin;
91   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
92   /* We do not expect one needs factors on host  */
93   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
94   PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
95   aijkok->a_dual.sync_host();
96   PetscFunctionReturn(0);
97 }
98 
99 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
100 {
101   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
102 
103   PetscFunctionBegin;
104   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
105     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
106     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
107     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
108   */
109   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
110     aijkok->a_dual.sync_host();
111     *array = aijkok->a_dual.view_host().data();
112   } else { /* Happens when calling MatSetValues on a newly created matrix */
113     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
114   }
115   PetscFunctionReturn(0);
116 }
117 
118 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
119 {
120   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
121 
122   PetscFunctionBegin;
123   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
124   PetscFunctionReturn(0);
125 }
126 
127 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
128 {
129   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
130 
131   PetscFunctionBegin;
132   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
133     aijkok->a_dual.sync_host();
134     *array = aijkok->a_dual.view_host().data();
135   } else {
136     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
137   }
138   PetscFunctionReturn(0);
139 }
140 
141 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
142 {
143   PetscFunctionBegin;
144   *array = NULL;
145   PetscFunctionReturn(0);
146 }
147 
148 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
149 {
150   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
151 
152   PetscFunctionBegin;
153   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
154     *array = aijkok->a_dual.view_host().data();
155   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
156     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
157   }
158   PetscFunctionReturn(0);
159 }
160 
161 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
162 {
163   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
164 
165   PetscFunctionBegin;
166   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
167     aijkok->a_dual.clear_sync_state();
168     aijkok->a_dual.modify_host();
169   }
170   PetscFunctionReturn(0);
171 }
172 
173 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
174 {
175   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
176 
177   PetscFunctionBegin;
178   PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL");
179 
180   if (i) *i = aijkok->i_device_data();
181   if (j) *j = aijkok->j_device_data();
182   if (a) {
183     aijkok->a_dual.sync_device();
184     *a = aijkok->a_device_data();
185   }
186   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
187   PetscFunctionReturn(0);
188 }
189 
190 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
191 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
192 {
193   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
194   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
195 
196   PetscFunctionBegin;
197   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
198   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
199   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
200   PetscFunctionReturn(0);
201 }
202 
203 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
204 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
205 {
206   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
207 
208   PetscFunctionBegin;
209   if (aijkok && aijkok->device_mat_d.data()) {
210     *d_mat = aijkok->device_mat_d.data();
211   } else {
212     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
213     *d_mat  = NULL;
214   }
215   PetscFunctionReturn(0);
216 }
217 
218 /* Generate the transpose on device and cache it internally */
219 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
220 {
221   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
222 
223   PetscFunctionBegin;
224   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
225   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
226     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
227     PetscCallCXX(aijkok->a_dual.sync_device());
228     PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
229     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
230     aijkok->transpose_updated = PETSC_TRUE;
231   }
232   *csrmatT = &aijkok->csrmatT;
233   PetscFunctionReturn(0);
234 }
235 
236 /* Generate the Hermitian on device and cache it internally */
237 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
238 {
239   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
240 
241   PetscFunctionBegin;
242   PetscCall(PetscLogGpuTimeBegin());
243   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
244   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
245     PetscCallCXX(aijkok->a_dual.sync_device());
246     PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
247     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
248    #if defined(PETSC_USE_COMPLEX)
249     const auto& a = aijkok->csrmatH.values;
250     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
251    #endif
252     aijkok->hermitian_updated = PETSC_TRUE;
253   }
254   *csrmatH = &aijkok->csrmatH;
255   PetscCall(PetscLogGpuTimeEnd());
256   PetscFunctionReturn(0);
257 }
258 
259 /* y = A x */
260 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
261 {
262   Mat_SeqAIJKokkos                 *aijkok;
263   ConstPetscScalarKokkosView       xv;
264   PetscScalarKokkosView            yv;
265 
266   PetscFunctionBegin;
267   PetscCall(PetscLogGpuTimeBegin());
268   PetscCall(MatSeqAIJKokkosSyncDevice(A));
269   PetscCall(VecGetKokkosView(xx,&xv));
270   PetscCall(VecGetKokkosViewWrite(yy,&yv));
271   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
272   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
273   PetscCall(VecRestoreKokkosView(xx,&xv));
274   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
275   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
276   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
277   PetscCall(PetscLogGpuTimeEnd());
278   PetscFunctionReturn(0);
279 }
280 
281 /* y = A^T x */
282 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
283 {
284   Mat_SeqAIJKokkos                 *aijkok;
285   const char                       *mode;
286   ConstPetscScalarKokkosView       xv;
287   PetscScalarKokkosView            yv;
288   KokkosCsrMatrix                  *csrmat;
289 
290   PetscFunctionBegin;
291   PetscCall(PetscLogGpuTimeBegin());
292   PetscCall(MatSeqAIJKokkosSyncDevice(A));
293   PetscCall(VecGetKokkosView(xx,&xv));
294   PetscCall(VecGetKokkosViewWrite(yy,&yv));
295   if (A->form_explicit_transpose) {
296     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
297     mode = "N";
298   } else {
299     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
300     csrmat = &aijkok->csrmat;
301     mode = "T";
302   }
303   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
304   PetscCall(VecRestoreKokkosView(xx,&xv));
305   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
306   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
307   PetscCall(PetscLogGpuTimeEnd());
308   PetscFunctionReturn(0);
309 }
310 
311 /* y = A^H x */
312 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
313 {
314   Mat_SeqAIJKokkos                 *aijkok;
315   const char                       *mode;
316   ConstPetscScalarKokkosView       xv;
317   PetscScalarKokkosView            yv;
318   KokkosCsrMatrix                  *csrmat;
319 
320   PetscFunctionBegin;
321   PetscCall(PetscLogGpuTimeBegin());
322   PetscCall(MatSeqAIJKokkosSyncDevice(A));
323   PetscCall(VecGetKokkosView(xx,&xv));
324   PetscCall(VecGetKokkosViewWrite(yy,&yv));
325   if (A->form_explicit_transpose) {
326     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
327     mode = "N";
328   } else {
329     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
330     csrmat = &aijkok->csrmat;
331     mode = "C";
332   }
333   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
334   PetscCall(VecRestoreKokkosView(xx,&xv));
335   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
336   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
337   PetscCall(PetscLogGpuTimeEnd());
338   PetscFunctionReturn(0);
339 }
340 
341 /* z = A x + y */
342 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
343 {
344   Mat_SeqAIJKokkos                 *aijkok;
345   ConstPetscScalarKokkosView       xv,yv;
346   PetscScalarKokkosView            zv;
347 
348   PetscFunctionBegin;
349   PetscCall(PetscLogGpuTimeBegin());
350   PetscCall(MatSeqAIJKokkosSyncDevice(A));
351   PetscCall(VecGetKokkosView(xx,&xv));
352   PetscCall(VecGetKokkosView(yy,&yv));
353   PetscCall(VecGetKokkosViewWrite(zz,&zv));
354   if (zz != yy) Kokkos::deep_copy(zv,yv);
355   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
356   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
357   PetscCall(VecRestoreKokkosView(xx,&xv));
358   PetscCall(VecRestoreKokkosView(yy,&yv));
359   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
360   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
361   PetscCall(PetscLogGpuTimeEnd());
362   PetscFunctionReturn(0);
363 }
364 
365 /* z = A^T x + y */
366 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
367 {
368   Mat_SeqAIJKokkos                 *aijkok;
369   const char                       *mode;
370   ConstPetscScalarKokkosView       xv,yv;
371   PetscScalarKokkosView            zv;
372   KokkosCsrMatrix                  *csrmat;
373 
374   PetscFunctionBegin;
375   PetscCall(PetscLogGpuTimeBegin());
376   PetscCall(MatSeqAIJKokkosSyncDevice(A));
377   PetscCall(VecGetKokkosView(xx,&xv));
378   PetscCall(VecGetKokkosView(yy,&yv));
379   PetscCall(VecGetKokkosViewWrite(zz,&zv));
380   if (zz != yy) Kokkos::deep_copy(zv,yv);
381   if (A->form_explicit_transpose) {
382     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
383     mode = "N";
384   } else {
385     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
386     csrmat = &aijkok->csrmat;
387     mode = "T";
388   }
389   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
390   PetscCall(VecRestoreKokkosView(xx,&xv));
391   PetscCall(VecRestoreKokkosView(yy,&yv));
392   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
393   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
394   PetscCall(PetscLogGpuTimeEnd());
395   PetscFunctionReturn(0);
396 }
397 
398 /* z = A^H x + y */
399 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
400 {
401   Mat_SeqAIJKokkos                 *aijkok;
402   const char                       *mode;
403   ConstPetscScalarKokkosView       xv,yv;
404   PetscScalarKokkosView            zv;
405   KokkosCsrMatrix                  *csrmat;
406 
407   PetscFunctionBegin;
408   PetscCall(PetscLogGpuTimeBegin());
409   PetscCall(MatSeqAIJKokkosSyncDevice(A));
410   PetscCall(VecGetKokkosView(xx,&xv));
411   PetscCall(VecGetKokkosView(yy,&yv));
412   PetscCall(VecGetKokkosViewWrite(zz,&zv));
413   if (zz != yy) Kokkos::deep_copy(zv,yv);
414   if (A->form_explicit_transpose) {
415     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
416     mode = "N";
417   } else {
418     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
419     csrmat = &aijkok->csrmat;
420     mode = "C";
421   }
422   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
423   PetscCall(VecRestoreKokkosView(xx,&xv));
424   PetscCall(VecRestoreKokkosView(yy,&yv));
425   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
426   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
427   PetscCall(PetscLogGpuTimeEnd());
428   PetscFunctionReturn(0);
429 }
430 
431 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
432 {
433   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
434 
435   PetscFunctionBegin;
436   switch (op) {
437     case MAT_FORM_EXPLICIT_TRANSPOSE:
438       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
439       if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
440       A->form_explicit_transpose = flg;
441       break;
442     default:
443       PetscCall(MatSetOption_SeqAIJ(A,op,flg));
444       break;
445   }
446   PetscFunctionReturn(0);
447 }
448 
449 /* Depending on reuse, either build a new mat, or use the existing mat */
450 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
451 {
452   Mat_SeqAIJ       *aseq;
453 
454   PetscFunctionBegin;
455   PetscCall(PetscKokkosInitializeCheck());
456   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
457     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */
458   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
459     PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
460   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
461     PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
462     PetscCall(PetscFree(A->defaultvectype));
463     PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */
464     PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS));
465     PetscCall(MatSetOps_SeqAIJKokkos(A));
466     aseq = static_cast<Mat_SeqAIJ*>(A->data);
467     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
468       PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
469       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
470     }
471   }
472   PetscFunctionReturn(0);
473 }
474 
475 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
476    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
477  */
478 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
479 {
480   Mat_SeqAIJ            *bseq;
481   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
482   Mat                   mat;
483 
484   PetscFunctionBegin;
485   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
486   PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B));
487   mat  = *B;
488   if (A->assembled) {
489     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
490     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
491     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
492     /* Now copy values to B if needed */
493     if (dupOption == MAT_COPY_VALUES) {
494       if (akok->a_dual.need_sync_device()) {
495         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
496         bkok->a_dual.modify_host();
497       } else { /* If device has the latest data, we only copy data on device */
498         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
499         bkok->a_dual.modify_device();
500       }
501     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
502       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
503       bkok->a_dual.modify_host();
504     }
505     mat->spptr = bkok;
506   }
507 
508   PetscCall(PetscFree(mat->defaultvectype));
509   PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */
510   PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS));
511   PetscCall(MatSetOps_SeqAIJKokkos(mat));
512   PetscFunctionReturn(0);
513 }
514 
515 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
516 {
517   Mat               At;
518   KokkosCsrMatrix   *internT;
519   Mat_SeqAIJKokkos  *atkok,*bkok;
520 
521   PetscFunctionBegin;
522   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */
523   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
524     /* Deep copy internT, as we want to isolate the internal transpose */
525     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
526     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At));
527     if (reuse == MAT_INITIAL_MATRIX) *B = At;
528     else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */
529   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
530     if ((*B)->assembled) {
531       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
532       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
533       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
534     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
535       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
536       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
537       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
538       PetscCallCXX(Kokkos::deep_copy(a_h,internT->values));
539       PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
540     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
541   }
542   PetscFunctionReturn(0);
543 }
544 
545 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
546 {
547   Mat_SeqAIJKokkos           *aijkok;
548 
549   PetscFunctionBegin;
550   if (A->factortype == MAT_FACTOR_NONE) {
551     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
552     delete aijkok;
553   } else {
554     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
555   }
556   A->spptr = NULL;
557   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
558   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
559   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
560   PetscCall(MatDestroy_SeqAIJ(A));
561   PetscFunctionReturn(0);
562 }
563 
564 /*MC
565    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
566 
567    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
568 
569    Options Database Keys:
570 .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
571 
572   Level: beginner
573 
574 .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
575 M*/
576 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
577 {
578   PetscFunctionBegin;
579   PetscCall(PetscKokkosInitializeCheck());
580   PetscCall(MatCreate_SeqAIJ(A));
581   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A));
582   PetscFunctionReturn(0);
583 }
584 
585 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
586 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
587 {
588   Mat_SeqAIJ                   *a,*b;
589   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
590   MatScalarKokkosView          aa,ba,ca;
591   MatRowMapKokkosView          ai,bi,ci;
592   MatColIdxKokkosView          aj,bj,cj;
593   PetscInt                     m,n,nnz,aN;
594 
595   PetscFunctionBegin;
596   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
597   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
598   PetscValidPointer(C,4);
599   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
600   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
601   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
602   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
603 
604   PetscCall(MatSeqAIJKokkosSyncDevice(A));
605   PetscCall(MatSeqAIJKokkosSyncDevice(B));
606   a    = static_cast<Mat_SeqAIJ*>(A->data);
607   b    = static_cast<Mat_SeqAIJ*>(B->data);
608   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
609   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
610   aa   = akok->a_dual.view_device();
611   ai   = akok->i_dual.view_device();
612   ba   = bkok->a_dual.view_device();
613   bi   = bkok->i_dual.view_device();
614   m    = A->rmap->n; /* M, N and nnz of C */
615   n    = A->cmap->n + B->cmap->n;
616   nnz  = a->nz + b->nz;
617   aN   = A->cmap->n; /* N of A */
618   if (reuse == MAT_INITIAL_MATRIX) {
619     aj = akok->j_dual.view_device();
620     bj = bkok->j_dual.view_device();
621     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
622     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
623     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
624     ca = ca_dual.view_device();
625     ci = ci_dual.view_device();
626     cj = cj_dual.view_device();
627 
628     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
629     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
630       PetscInt i = t.league_rank(); /* row i */
631       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
632 
633       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
634         ci(i) = coffset;
635         if (i == m-1) ci(m) = ai(m) + bi(m);
636       });
637 
638       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
639         if (k < alen) {
640           ca(coffset+k) = aa(ai(i)+k);
641           cj(coffset+k) = aj(ai(i)+k);
642         } else {
643           ca(coffset+k) = ba(bi(i)+k-alen);
644           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
645         }
646       });
647     });
648     ca_dual.modify_device();
649     ci_dual.modify_device();
650     cj_dual.modify_device();
651     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
652     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C));
653   } else if (reuse == MAT_REUSE_MATRIX) {
654     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
655     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
656     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
657     ca   = ckok->a_dual.view_device();
658     ci   = ckok->i_dual.view_device();
659 
660     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
661       PetscInt i = t.league_rank(); /* row i */
662       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
663       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
664         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
665         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
666       });
667     });
668     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
669   }
670   PetscFunctionReturn(0);
671 }
672 
673 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
674 {
675   PetscFunctionBegin;
676   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
677   PetscFunctionReturn(0);
678 }
679 
680 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
681 {
682   Mat_Product                    *product = C->product;
683   Mat                            A,B;
684   bool                           transA,transB; /* use bool, since KK needs this type */
685   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
686   Mat_SeqAIJ                     *c;
687   MatProductData_SeqAIJKokkos    *pdata;
688   KokkosCsrMatrix                *csrmatA,*csrmatB;
689 
690   PetscFunctionBegin;
691   MatCheckProduct(C,1);
692   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
693   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
694 
695   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
696     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
697     PetscFunctionReturn(0);
698   }
699 
700   switch (product->type) {
701     case MATPRODUCT_AB:  transA = false; transB = false; break;
702     case MATPRODUCT_AtB: transA = true;  transB = false; break;
703     case MATPRODUCT_ABt: transA = false; transB = true;  break;
704     default:
705       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
706   }
707 
708   A     = product->A;
709   B     = product->B;
710   PetscCall(MatSeqAIJKokkosSyncDevice(A));
711   PetscCall(MatSeqAIJKokkosSyncDevice(B));
712   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
713   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
714   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
715 
716   PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
717 
718   csrmatA = &akok->csrmat;
719   csrmatB = &bkok->csrmat;
720 
721   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
722   if (transA) {
723     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
724     transA = false;
725   }
726 
727   if (transB) {
728     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
729     transB = false;
730   }
731   PetscCall(PetscLogGpuTimeBegin());
732   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
733   PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
734   PetscCall(PetscLogGpuTimeEnd());
735   PetscCall(MatSeqAIJKokkosModifyDevice(C));
736   /* shorter version of MatAssemblyEnd_SeqAIJ */
737   c = (Mat_SeqAIJ*)C->data;
738   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
739   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
740   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
741   c->reallocs         = 0;
742   C->info.mallocs     = 0;
743   C->info.nz_unneeded = 0;
744   C->assembled        = C->was_assembled = PETSC_TRUE;
745   C->num_ass++;
746   PetscFunctionReturn(0);
747 }
748 
749 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
750 {
751   Mat_Product                    *product = C->product;
752   MatProductType                 ptype;
753   Mat                            A,B;
754   bool                           transA,transB;
755   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
756   MatProductData_SeqAIJKokkos    *pdata;
757   MPI_Comm                       comm;
758   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
759 
760   PetscFunctionBegin;
761   MatCheckProduct(C,1);
762   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
763   PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
764   A       = product->A;
765   B       = product->B;
766   PetscCall(MatSeqAIJKokkosSyncDevice(A));
767   PetscCall(MatSeqAIJKokkosSyncDevice(B));
768   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
769   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
770   csrmatA = &akok->csrmat;
771   csrmatB = &bkok->csrmat;
772 
773   ptype   = product->type;
774   switch (ptype) {
775     case MATPRODUCT_AB:  transA = false; transB = false; break;
776     case MATPRODUCT_AtB: transA = true;  transB = false; break;
777     case MATPRODUCT_ABt: transA = false; transB = true;  break;
778     default:
779       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
780   }
781 
782   product->data = pdata = new MatProductData_SeqAIJKokkos();
783   pdata->kh.set_team_work_size(16);
784   pdata->kh.set_dynamic_scheduling(true);
785   pdata->reusesym = product->api_user;
786 
787   /* TODO: add command line options to select spgemm algorithms */
788   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
789   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
790      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
791    */
792   pdata->kh.create_spgemm_handle(spgemm_alg);
793 
794   PetscCall(PetscLogGpuTimeBegin());
795   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
796   if (transA) {
797     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
798     transA = false;
799   }
800 
801   if (transB) {
802     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
803     transB = false;
804   }
805 
806   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
807   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
808     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
809     calling new Mat_SeqAIJKokkos().
810     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
811   */
812   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
813   PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC));
814   PetscCall(PetscLogGpuTimeEnd());
815 
816   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
817   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok));
818   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
819   PetscFunctionReturn(0);
820 }
821 
822 /* handles sparse matrix matrix ops */
823 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
824 {
825   Mat_Product    *product = mat->product;
826   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
827 
828   PetscFunctionBegin;
829   MatCheckProduct(mat,1);
830   PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok));
831   if (product->type == MATPRODUCT_ABC) {
832     PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok));
833   }
834   if (Biskok && Ciskok) {
835     switch (product->type) {
836     case MATPRODUCT_AB:
837     case MATPRODUCT_AtB:
838     case MATPRODUCT_ABt:
839       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
840       break;
841     case MATPRODUCT_PtAP:
842     case MATPRODUCT_RARt:
843     case MATPRODUCT_ABC:
844       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
845       break;
846     default:
847       break;
848     }
849   } else { /* fallback for AIJ */
850     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
851   }
852   PetscFunctionReturn(0);
853 }
854 
855 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
856 {
857   Mat_SeqAIJKokkos *aijkok;
858 
859   PetscFunctionBegin;
860   PetscCall(PetscLogGpuTimeBegin());
861   PetscCall(MatSeqAIJKokkosSyncDevice(A));
862   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
863   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
864   PetscCall(MatSeqAIJKokkosModifyDevice(A));
865   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
866   PetscCall(PetscLogGpuTimeEnd());
867   PetscFunctionReturn(0);
868 }
869 
870 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
871 {
872   Mat_SeqAIJKokkos *aijkok;
873 
874   PetscFunctionBegin;
875   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
876   if (aijkok) { /* Only zero the device if data is already there */
877     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
878     PetscCall(MatSeqAIJKokkosModifyDevice(A));
879   } else { /* Might be preallocated but not assembled */
880     PetscCall(MatZeroEntries_SeqAIJ(A));
881   }
882   PetscFunctionReturn(0);
883 }
884 
885 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A,Vec x)
886 {
887   Mat_SeqAIJ                   *aijseq;
888   Mat_SeqAIJKokkos             *aijkok;
889   PetscInt                     n;
890   PetscScalarKokkosView        xv;
891 
892   PetscFunctionBegin;
893   PetscCall(VecGetLocalSize(x,&n));
894   PetscCheck(n == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
895   PetscCheck(A->factortype == MAT_FACTOR_NONE,PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
896 
897   PetscCall(MatSeqAIJKokkosSyncDevice(A));
898   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
899 
900   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
901     PetscCall(MatMarkDiagonal_SeqAIJ(A));
902     aijseq = static_cast<Mat_SeqAIJ*>(A->data);
903     aijkok->SetDiagonal(aijseq->diag);
904   }
905 
906   const auto& Aa = aijkok->a_dual.view_device();
907   const auto& Ai = aijkok->i_dual.view_device();
908   const auto& Adiag = aijkok->diag_dual.view_device();
909 
910   PetscCall(VecGetKokkosViewWrite(x,&xv));
911   Kokkos::parallel_for(n,KOKKOS_LAMBDA(const PetscInt i) {
912     if (Adiag(i) < Ai(i+1)) xv(i) = Aa(Adiag(i));
913     else xv(i) = 0;
914   });
915   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
916   PetscFunctionReturn(0);
917 }
918 
919 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
920 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
921 {
922   Mat_SeqAIJKokkos   *aijkok;
923 
924   PetscFunctionBegin;
925   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
926   PetscValidPointer(kv,2);
927   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
928   PetscCall(MatSeqAIJKokkosSyncDevice(A));
929   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
930   *kv    = aijkok->a_dual.view_device();
931   PetscFunctionReturn(0);
932 }
933 
934 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
935 {
936   PetscFunctionBegin;
937   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
938   PetscValidPointer(kv,2);
939   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
940   PetscFunctionReturn(0);
941 }
942 
943 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
944 {
945   Mat_SeqAIJKokkos   *aijkok;
946 
947   PetscFunctionBegin;
948   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
949   PetscValidPointer(kv,2);
950   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
951   PetscCall(MatSeqAIJKokkosSyncDevice(A));
952   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
953   *kv    = aijkok->a_dual.view_device();
954   PetscFunctionReturn(0);
955 }
956 
957 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
958 {
959   PetscFunctionBegin;
960   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
961   PetscValidPointer(kv,2);
962   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
963   PetscCall(MatSeqAIJKokkosModifyDevice(A));
964   PetscFunctionReturn(0);
965 }
966 
967 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
968 {
969   Mat_SeqAIJKokkos   *aijkok;
970 
971   PetscFunctionBegin;
972   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
973   PetscValidPointer(kv,2);
974   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
975   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
976   *kv    = aijkok->a_dual.view_device();
977   PetscFunctionReturn(0);
978 }
979 
980 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
981 {
982   PetscFunctionBegin;
983   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
984   PetscValidPointer(kv,2);
985   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
986   PetscCall(MatSeqAIJKokkosModifyDevice(A));
987   PetscFunctionReturn(0);
988 }
989 
990 /* Computes Y += alpha X */
991 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
992 {
993   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
994   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
995   ConstMatScalarKokkosView   Xa;
996   MatScalarKokkosView        Ya;
997 
998   PetscFunctionBegin;
999   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
1000   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
1001   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1002   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1003   PetscCall(PetscLogGpuTimeBegin());
1004 
1005   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1006     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
1007     PetscBool e;
1008     PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e));
1009     if (e) {
1010       PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e));
1011       if (e) pattern = SAME_NONZERO_PATTERN;
1012     }
1013   }
1014 
1015   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1016     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1017     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1018     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1019   */
1020   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
1021   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
1022   Xa   = xkok->a_dual.view_device();
1023   Ya   = ykok->a_dual.view_device();
1024 
1025   if (pattern == SAME_NONZERO_PATTERN) {
1026     KokkosBlas::axpy(alpha,Xa,Ya);
1027     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1028   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1029     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
1030     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
1031 
1032     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
1033       PetscInt i = t.league_rank(); /* row i */
1034       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
1035         PetscInt p,q = Yi(i);
1036         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1037           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1038           if (Xj(p) == Yj(q)) { /* Found it */
1039             Ya(q) += alpha * Xa(p);
1040             q++;
1041           } else {
1042             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1043                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1044             */
1045             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) =
1046            #if PETSC_PKG_KOKKOS_VERSION_GE(3,6,99)
1047               Kokkos::nan("1"); /* auto promote the double NaN if needed */
1048            #else
1049               Kokkos::Experimental::nan("1");
1050            #endif
1051           }
1052         }
1053       });
1054     });
1055     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1056   } else { /* different nonzero patterns */
1057     Mat             Z;
1058     KokkosCsrMatrix zcsr;
1059     KernelHandle    kh;
1060     kh.create_spadd_handle(false);
1061     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1062     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1063     zkok = new Mat_SeqAIJKokkos(zcsr);
1064     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z));
1065     PetscCall(MatHeaderReplace(Y,&Z));
1066     kh.destroy_spadd_handle();
1067   }
1068   PetscCall(PetscLogGpuTimeEnd());
1069   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */
1070   PetscFunctionReturn(0);
1071 }
1072 
1073 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1074 {
1075   Mat_SeqAIJKokkos *akok;
1076   Mat_SeqAIJ       *aseq;
1077 
1078   PetscFunctionBegin;
1079   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
1080   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
1081   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1082   delete akok;
1083   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE);
1084   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1085   akok->SetUpCOO(aseq);
1086   PetscFunctionReturn(0);
1087 }
1088 
1089 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1090 {
1091   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1092   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1093   PetscCount                  Annz = aseq->nz;
1094   const PetscCountKokkosView& jmap = akok->jmap_d;
1095   const PetscCountKokkosView& perm = akok->perm_d;
1096   MatScalarKokkosView         Aa;
1097   ConstMatScalarKokkosView    kv;
1098   PetscMemType                memtype;
1099 
1100   PetscFunctionBegin;
1101   PetscCall(PetscGetMemType(v,&memtype));
1102   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1103     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
1104   } else {
1105     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
1106   }
1107 
1108   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */
1109   else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */
1110 
1111   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {
1112     PetscScalar sum = 0.0;
1113     for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k));
1114     Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum;
1115   });
1116 
1117   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
1118   else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa));
1119   PetscFunctionReturn(0);
1120 }
1121 
1122 PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag)
1123 {
1124   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1125   MatScalarKokkosView         Aa;
1126   const MatRowMapKokkosView&  Ai = akok->i_dual.view_device();
1127   PetscInt                    m = A->rmap->n;
1128   ConstMatRowMapKokkosView    Adiag(diag,m); /* diag is a device pointer */
1129 
1130   PetscFunctionBegin;
1131   PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa));
1132   Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) {
1133     PetscScalar tmp;
1134     if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */
1135       tmp          = Aa(Ai(i));
1136       Aa(Ai(i))    = Aa(Adiag(i));
1137       Aa(Adiag(i)) = tmp;
1138     }
1139   });
1140   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
1141   PetscFunctionReturn(0);
1142 }
1143 
1144 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1145 {
1146   PetscFunctionBegin;
1147   PetscCall(MatSeqAIJKokkosSyncHost(A));
1148   PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
1149   B->offloadmask = PETSC_OFFLOAD_CPU;
1150   PetscFunctionReturn(0);
1151 }
1152 
1153 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1154 {
1155   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1156 
1157   PetscFunctionBegin;
1158   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1159   A->boundtocpu  = PETSC_FALSE;
1160 
1161   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1162   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1163   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1164   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1165   A->ops->scale                     = MatScale_SeqAIJKokkos;
1166   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1167   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1168   A->ops->mult                      = MatMult_SeqAIJKokkos;
1169   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1170   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1171   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1172   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1173   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1174   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1175   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1176   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1177   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1178   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1179   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1180   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1181   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1182   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1183   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1184   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1185 
1186   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos));
1187   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos));
1188   PetscFunctionReturn(0);
1189 }
1190 
1191 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1192 {
1193   Mat_SeqAIJ *aseq;
1194   PetscInt    i,m,n;
1195 
1196   PetscFunctionBegin;
1197   PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1198 
1199   m = akok->nrows();
1200   n = akok->ncols();
1201   PetscCall(MatSetSizes(A,m,n,m,n));
1202   PetscCall(MatSetType(A,MATSEQAIJKOKKOS));
1203 
1204   /* Set up data structures of A as a MATSEQAIJ */
1205   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL));
1206   aseq = (Mat_SeqAIJ*)(A)->data;
1207 
1208   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1209   akok->j_dual.sync_host();
1210 
1211   aseq->i            = akok->i_host_data();
1212   aseq->j            = akok->j_host_data();
1213   aseq->a            = akok->a_host_data();
1214   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1215   aseq->singlemalloc = PETSC_FALSE;
1216   aseq->free_a       = PETSC_FALSE;
1217   aseq->free_ij      = PETSC_FALSE;
1218   aseq->nz           = akok->nnz();
1219   aseq->maxnz        = aseq->nz;
1220 
1221   PetscCall(PetscMalloc1(m,&aseq->imax));
1222   PetscCall(PetscMalloc1(m,&aseq->ilen));
1223   for (i=0; i<m; i++) {
1224     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1225   }
1226 
1227   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1228   akok->nonzerostate = A->nonzerostate;
1229   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1230   PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
1231   PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
1232   PetscFunctionReturn(0);
1233 }
1234 
1235 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1236 
1237    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1238  */
1239 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1240 {
1241   PetscFunctionBegin;
1242   PetscCall(MatCreate(comm,A));
1243   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok));
1244   PetscFunctionReturn(0);
1245 }
1246 
1247 /* --------------------------------------------------------------------------------*/
1248 /*@C
1249    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1250    (the default parallel PETSc format). This matrix will ultimately be handled by
1251    Kokkos for calculations. For good matrix
1252    assembly performance the user should preallocate the matrix storage by setting
1253    the parameter nz (or the array nnz).  By setting these parameters accurately,
1254    performance during matrix assembly can be increased by more than a factor of 50.
1255 
1256    Collective
1257 
1258    Input Parameters:
1259 +  comm - MPI communicator, set to PETSC_COMM_SELF
1260 .  m - number of rows
1261 .  n - number of columns
1262 .  nz - number of nonzeros per row (same for all rows)
1263 -  nnz - array containing the number of nonzeros in the various rows
1264          (possibly different for each row) or NULL
1265 
1266    Output Parameter:
1267 .  A - the matrix
1268 
1269    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1270    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1271    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1272 
1273    Notes:
1274    If nnz is given then nz is ignored
1275 
1276    The AIJ format (also called the Yale sparse matrix format or
1277    compressed row storage), is fully compatible with standard Fortran 77
1278    storage.  That is, the stored row and column indices can begin at
1279    either one (as in Fortran) or zero.  See the users' manual for details.
1280 
1281    Specify the preallocated storage with either nz or nnz (not both).
1282    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1283    allocation.  For large problems you MUST preallocate memory or you
1284    will get TERRIBLE performance, see the users' manual chapter on matrices.
1285 
1286    By default, this format uses inodes (identical nodes) when possible, to
1287    improve numerical efficiency of matrix-vector products and solves. We
1288    search for consecutive rows with the same nonzero structure, thereby
1289    reusing matrix information to achieve increased efficiency.
1290 
1291    Level: intermediate
1292 
1293 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
1294 @*/
1295 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1296 {
1297   PetscFunctionBegin;
1298   PetscCall(PetscKokkosInitializeCheck());
1299   PetscCall(MatCreate(comm,A));
1300   PetscCall(MatSetSizes(*A,m,n,m,n));
1301   PetscCall(MatSetType(*A,MATSEQAIJKOKKOS));
1302   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
1303   PetscFunctionReturn(0);
1304 }
1305 
1306 typedef Kokkos::TeamPolicy<>::member_type team_member;
1307 //
1308 // This factorization exploits block diagonal matrices with "Nf" (not used).
1309 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1310 //
1311 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1312 {
1313   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1314   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1315   IS                 isrow = b->row,isicol = b->icol;
1316   const PetscInt     *r_h,*ic_h;
1317   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1318   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1319   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1320   PetscBool          row_identity,col_identity;
1321   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1322 
1323   PetscFunctionBegin;
1324   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1325   PetscCall(MatIsStructurallySymmetric(A,&row_identity));
1326   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1327   PetscCall(ISGetIndices(isrow,&r_h));
1328   PetscCall(ISGetIndices(isicol,&ic_h));
1329   PetscCall(ISGetSize(isicol,&nc));
1330   PetscCall(PetscLogGpuTimeBegin());
1331   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1332   {
1333 #define KOKKOS_SHARED_LEVEL 1
1334     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1335     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1336     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1337     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1338     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1339     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1340     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1341     size_t flops_h = 0.0;
1342     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1343     Kokkos::View<size_t> d_flops_k ("flops");
1344     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1345     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1346     Kokkos::deep_copy (d_flops_k, h_flops_k);
1347     Kokkos::deep_copy (d_r_k, h_r_k);
1348     Kokkos::deep_copy (d_ic_k, h_ic_k);
1349     // Fill A --> fact
1350     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1351         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1352         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1353         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1354         // zero rows of B
1355         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1356             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1357             PetscScalar *baL = ba_d + bi_d[rowb];
1358             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1359             /* zero (unfactored row) */
1360             for (int j=0;j<nzbL;j++) baL[j] = 0;
1361             for (int j=0;j<nzbU;j++) baU[j] = 0;
1362           });
1363         // copy A into B
1364         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1365             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1366             const PetscScalar *av    = aa_d + ai_d[rowa];
1367             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1368             /* load in initial (unfactored row) */
1369             for (int j=0;j<nza;j++) {
1370               PetscInt    colb = ic[ajtmp[j]];
1371               PetscScalar vala = av[j];
1372               if (colb == rowb) {
1373                 *(ba_d + bdiag_d[rowb]) = vala;
1374               } else {
1375                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1376                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1377                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1378                 for (int j=0; j<nz ; j++) {
1379                   if (pbj[j] == colb) {
1380                     pba[j] = vala;
1381                     set++;
1382                     break;
1383                   }
1384                 }
1385                #if !defined(PETSC_HAVE_SYCL)
1386                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1387                #endif
1388               }
1389             }
1390           });
1391       });
1392     Kokkos::fence();
1393 
1394     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1395         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1396         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1397         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1398         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1399         const PetscInt  start = field*nloc, end = start + nloc;
1400         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1401         // A22 panel update for each row A(1,:) and col A(:,1)
1402         for (int ii=start; ii<end-1; ii++) {
1403           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1404           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1405           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1406           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1407           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1408               PetscInt kIdx = j*Ni + field_block_idx;
1409               if (kIdx >= nzUi) /* void */ ;
1410               else {
1411                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1412                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1413                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1414                 size_t         st_idx;
1415                 // find and do L(k,i) = A(:k,i) / A(i,i)
1416                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1417                 // get column, there has got to be a better way
1418                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1419                     if (pjL[j] == ii) {
1420                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1421                       idx = j; // output
1422                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1423                     }
1424                 }, st_idx);
1425                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1426 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1427                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1428 #endif
1429                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1430                 // U(i+1,:end)
1431                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1432                       PetscScalar Uij = baUi[uiIdx];
1433                       PetscInt    col = bjUi[uiIdx];
1434                       if (col==myk) {
1435                         // A_kk = A_kk - L_ki * U_ij(k)
1436                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1437                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1438                       } else {
1439                         PetscScalar    *start, *end, *pAkjv=NULL;
1440                         PetscInt       high, low;
1441                         const PetscInt *startj;
1442                         if (col<myk) { // L
1443                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1444                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1445                           start = pLki+1; // start at pLki+1, A22(myk,1)
1446                           startj= bj_d + bi_d[myk] + idx;
1447                           end   = ba_d + bi_d[myk+1];
1448                         } else {
1449                           PetscInt idx = bdiag_d[myk+1]+1;
1450                           start = ba_d + idx;
1451                           startj= bj_d + idx;
1452                           end   = ba_d + bdiag_d[myk];
1453                         }
1454                         // search for 'col', use bisection search - TODO
1455                         low  = 0;
1456                         high = (PetscInt)(end-start);
1457                         while (high-low > 5) {
1458                           int t = (low+high)/2;
1459                           if (startj[t] > col) high = t;
1460                           else                 low  = t;
1461                         }
1462                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1463                           if (startj[pAkjv-start] == col) break;
1464                         }
1465 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1466                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1467 #endif
1468                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1469                       }
1470                     });
1471               }
1472             });
1473           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1474           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1475         } /* endof for (i=0; i<n; i++) { */
1476         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1477       });
1478     Kokkos::fence();
1479     Kokkos::deep_copy (h_flops_k, d_flops_k);
1480     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
1481     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1482         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1483         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1484         /* Invert diagonal for simpler triangular solves */
1485         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1486             int i = start + outer_index*Ni + lg_rank%Ni;
1487             if (i < end) {
1488               PetscScalar *pv = ba_d + bdiag_d[i];
1489               *pv = 1.0/(*pv);
1490             }
1491           });
1492       });
1493   }
1494   PetscCall(PetscLogGpuTimeEnd());
1495   PetscCall(ISRestoreIndices(isicol,&ic_h));
1496   PetscCall(ISRestoreIndices(isrow,&r_h));
1497 
1498   PetscCall(ISIdentity(isrow,&row_identity));
1499   PetscCall(ISIdentity(isicol,&col_identity));
1500   if (b->inode.size) {
1501     B->ops->solve = MatSolve_SeqAIJ_Inode;
1502   } else if (row_identity && col_identity) {
1503     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1504   } else {
1505     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1506   }
1507   B->offloadmask = PETSC_OFFLOAD_GPU;
1508   PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU
1509   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1510   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1511   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1512   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1513   B->assembled              = PETSC_TRUE;
1514   B->preallocated           = PETSC_TRUE;
1515 
1516   PetscFunctionReturn(0);
1517 }
1518 
1519 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1520 {
1521   PetscFunctionBegin;
1522   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
1523   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1524   PetscFunctionReturn(0);
1525 }
1526 
1527 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1528 {
1529   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1530 
1531   PetscFunctionBegin;
1532   if (!factors->sptrsv_symbolic_completed) {
1533     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1534     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1535     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1536   }
1537   PetscFunctionReturn(0);
1538 }
1539 
1540 /* Check if we need to update factors etc for transpose solve */
1541 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1542 {
1543   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1544   MatColIdxType             n = A->rmap->n;
1545 
1546   PetscFunctionBegin;
1547   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1548     /* Update L^T and do sptrsv symbolic */
1549     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1550     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1551     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1552     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1553 
1554     KokkosKernels::Impl::transpose_matrix<
1555       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1556       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1557       MatRowMapKokkosView,DefaultExecutionSpace>(
1558         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1559         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1560 
1561     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1562       We have to sort the indices, until KK provides finer control options.
1563     */
1564     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1565       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1566         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1567 
1568     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1569 
1570     /* Update U^T and do sptrsv symbolic */
1571     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1572     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1573     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1574     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1575 
1576     KokkosKernels::Impl::transpose_matrix<
1577       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1578       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1579       MatRowMapKokkosView,DefaultExecutionSpace>(
1580         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1581         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1582 
1583     /* Sort indices. See comments above */
1584     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1585       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1586         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1587 
1588     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1589     factors->transpose_updated = PETSC_TRUE;
1590   }
1591   PetscFunctionReturn(0);
1592 }
1593 
1594 /* Solve Ax = b, with A = LU */
1595 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1596 {
1597   ConstPetscScalarKokkosView     bv;
1598   PetscScalarKokkosView          xv;
1599   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1600 
1601   PetscFunctionBegin;
1602   PetscCall(PetscLogGpuTimeBegin());
1603   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
1604   PetscCall(VecGetKokkosView(b,&bv));
1605   PetscCall(VecGetKokkosViewWrite(x,&xv));
1606   /* Solve L tmpv = b */
1607   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1608   /* Solve Ux = tmpv */
1609   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1610   PetscCall(VecRestoreKokkosView(b,&bv));
1611   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
1612   PetscCall(PetscLogGpuTimeEnd());
1613   PetscFunctionReturn(0);
1614 }
1615 
1616 /* Solve A^T x = b, where A^T = U^T L^T */
1617 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1618 {
1619   ConstPetscScalarKokkosView     bv;
1620   PetscScalarKokkosView          xv;
1621   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1622 
1623   PetscFunctionBegin;
1624   PetscCall(PetscLogGpuTimeBegin());
1625   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
1626   PetscCall(VecGetKokkosView(b,&bv));
1627   PetscCall(VecGetKokkosViewWrite(x,&xv));
1628   /* Solve U^T tmpv = b */
1629   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1630 
1631   /* Solve L^T x = tmpv */
1632   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1633   PetscCall(VecRestoreKokkosView(b,&bv));
1634   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
1635   PetscCall(PetscLogGpuTimeEnd());
1636   PetscFunctionReturn(0);
1637 }
1638 
1639 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1640 {
1641   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1642   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1643   PetscInt                       fill_lev = info->levels;
1644 
1645   PetscFunctionBegin;
1646   PetscCall(PetscLogGpuTimeBegin());
1647   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1648 
1649   auto a_d = aijkok->a_dual.view_device();
1650   auto i_d = aijkok->i_dual.view_device();
1651   auto j_d = aijkok->j_dual.view_device();
1652 
1653   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1654 
1655   B->assembled                       = PETSC_TRUE;
1656   B->preallocated                    = PETSC_TRUE;
1657   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1658   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1659   B->ops->matsolve                   = NULL;
1660   B->ops->matsolvetranspose          = NULL;
1661   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1662 
1663   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1664   factors->transpose_updated         = PETSC_FALSE;
1665   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1666   /* TODO: log flops, but how to know that? */
1667   PetscCall(PetscLogGpuTimeEnd());
1668   PetscFunctionReturn(0);
1669 }
1670 
1671 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1672 {
1673   Mat_SeqAIJKokkos               *aijkok;
1674   Mat_SeqAIJ                     *b;
1675   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1676   PetscInt                       fill_lev = info->levels;
1677   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1678   PetscInt                       n = A->rmap->n;
1679 
1680   PetscFunctionBegin;
1681   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1682   /* Rebuild factors */
1683   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1684   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1685 
1686   /* Create a spiluk handle and then do symbolic factorization */
1687   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1688   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1689 
1690   auto spiluk_handle = factors->kh.get_spiluk_handle();
1691 
1692   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1693   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1694   Kokkos::realloc(factors->iU_d,n+1);
1695   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1696 
1697   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1698   auto i_d = aijkok->i_dual.view_device();
1699   auto j_d = aijkok->j_dual.view_device();
1700   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1701   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1702 
1703   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1704   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1705   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1706   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1707 
1708   /* TODO: add options to select sptrsv algorithms */
1709   /* Create sptrsv handles for L, U and their transpose */
1710  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1711   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1712  #else
1713   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1714  #endif
1715 
1716   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1717   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1718   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1719   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1720 
1721   /* Fill fields of the factor matrix B */
1722   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL));
1723   b     = (Mat_SeqAIJ*)B->data;
1724   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1725   B->info.fill_ratio_given  = info->fill;
1726   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1727 
1728   B->offloadmask            = PETSC_OFFLOAD_GPU;
1729   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1730   PetscFunctionReturn(0);
1731 }
1732 
1733 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1734 {
1735   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1736   const PetscInt   nrows   = A->rmap->n;
1737 
1738   PetscFunctionBegin;
1739   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
1740   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1741   // move B data into Kokkos
1742   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
1743   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
1744   {
1745     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1746     if (!baijkok->diag_d.extent(0)) {
1747       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1748       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1749       Kokkos::deep_copy (baijkok->diag_d, h_diag);
1750     }
1751   }
1752   PetscFunctionReturn(0);
1753 }
1754 
1755 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1756 {
1757   PetscFunctionBegin;
1758   *type = MATSOLVERKOKKOS;
1759   PetscFunctionReturn(0);
1760 }
1761 
1762 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1763 {
1764   PetscFunctionBegin;
1765   *type = MATSOLVERKOKKOSDEVICE;
1766   PetscFunctionReturn(0);
1767 }
1768 
1769 /*MC
1770   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1771   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1772 
1773   Level: beginner
1774 
1775 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1776 M*/
1777 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1778 {
1779   PetscInt       n = A->rmap->n;
1780 
1781   PetscFunctionBegin;
1782   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1783   PetscCall(MatSetSizes(*B,n,n,n,n));
1784   (*B)->factortype = ftype;
1785   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1786   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1787 
1788   if (ftype == MAT_FACTOR_LU) {
1789     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1790     (*B)->canuseordering         = PETSC_TRUE;
1791     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1792   } else if (ftype == MAT_FACTOR_ILU) {
1793     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1794     (*B)->canuseordering         = PETSC_FALSE;
1795     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1796   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1797 
1798   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1799   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos));
1800   PetscFunctionReturn(0);
1801 }
1802 
1803 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1804 {
1805   PetscInt       n = A->rmap->n;
1806 
1807   PetscFunctionBegin;
1808   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1809   PetscCall(MatSetSizes(*B,n,n,n,n));
1810   (*B)->factortype = ftype;
1811   (*B)->canuseordering = PETSC_TRUE;
1812   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1813   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1814 
1815   if (ftype == MAT_FACTOR_LU) {
1816     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1817     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1818   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1819 
1820   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1821   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device));
1822   PetscFunctionReturn(0);
1823 }
1824 
1825 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1826 {
1827   PetscFunctionBegin;
1828   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos));
1829   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos));
1830   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device));
1831   PetscFunctionReturn(0);
1832 }
1833 
1834 /* Utility to print out a KokkosCsrMatrix for debugging */
1835 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1836 {
1837   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1838   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1839   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1840   const PetscInt    *i = iv.data();
1841   const PetscInt    *j = jv.data();
1842   const PetscScalar *a = av.data();
1843   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1844 
1845   PetscFunctionBegin;
1846   PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz));
1847   for (PetscInt k=0; k<m; k++) {
1848     PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k));
1849     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1850       PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],(double)PetscRealPart(a[p])));
1851     }
1852     PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n"));
1853   }
1854   PetscFunctionReturn(0);
1855 }
1856