xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 394ed5eb51aeb2fd4d5be41f3bc40ab0c0d7d8f0)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosKernels_Sorting.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 
18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   PetscErrorCode    ierr;
29   Mat_SeqAIJ        *aijseq;
30   Mat_SeqAIJKokkos  *aijkok;
31 
32   PetscFunctionBegin;
33   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
34   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
35 
36   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38 
39   /* If aijkok does not exist, we just copy i, j to device.
40      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
41      In both cases, we build a new aijkok structure.
42   */
43   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
44     delete aijkok;
45     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
46     A->spptr = aijkok;
47   }
48 
49   if (aijkok && aijkok->device_mat_d.data()) {
50     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
51   }
52   PetscFunctionReturn(0);
53 }
54 
55 /* Sync CSR data to device if not yet */
56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
57 {
58   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
59 
60   PetscFunctionBegin;
61   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
62   PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
63   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
64   if (aijkok->a_dual.need_sync_device()) {
65     aijkok->a_dual.sync_device();
66     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
67     aijkok->hermitian_updated = PETSC_FALSE;
68   }
69   PetscFunctionReturn(0);
70 }
71 
72 /* Mark the CSR data on device as modified */
73 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
74 {
75   PetscErrorCode   ierr;
76   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
77 
78   PetscFunctionBegin;
79   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
80   aijkok->a_dual.clear_sync_state();
81   aijkok->a_dual.modify_device();
82   aijkok->transpose_updated = PETSC_FALSE;
83   aijkok->hermitian_updated = PETSC_FALSE;
84   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
85   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
86   PetscFunctionReturn(0);
87 }
88 
89 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
90 {
91   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
92 
93   PetscFunctionBegin;
94   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
95    /* We do not expect one needs factors on host  */
96   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
97   PetscCheckFalse(!aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
98   aijkok->a_dual.sync_host();
99   PetscFunctionReturn(0);
100 }
101 
102 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
103 {
104   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
105 
106   PetscFunctionBegin;
107   if (aijkok) {
108     aijkok->a_dual.sync_host();
109     *array = aijkok->a_dual.view_host().data();
110   } else { /* Happens when calling MatSetValues on a newly created matrix */
111     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
112   }
113   PetscFunctionReturn(0);
114 }
115 
116 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
117 {
118   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
119 
120   PetscFunctionBegin;
121   if (aijkok) aijkok->a_dual.modify_host();
122   PetscFunctionReturn(0);
123 }
124 
125 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
126 {
127   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
128 
129   PetscFunctionBegin;
130   if (aijkok) {
131     aijkok->a_dual.sync_host();
132     *array = aijkok->a_dual.view_host().data();
133   } else {
134     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
135   }
136   PetscFunctionReturn(0);
137 }
138 
139 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
140 {
141   PetscFunctionBegin;
142   *array = NULL;
143   PetscFunctionReturn(0);
144 }
145 
146 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
147 {
148   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
149 
150   PetscFunctionBegin;
151   if (aijkok) {
152     *array = aijkok->a_dual.view_host().data();
153   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
154     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
155   }
156   PetscFunctionReturn(0);
157 }
158 
159 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
160 {
161   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
162 
163   PetscFunctionBegin;
164   if (aijkok) {
165     aijkok->a_dual.clear_sync_state();
166     aijkok->a_dual.modify_host();
167   }
168   PetscFunctionReturn(0);
169 }
170 
171 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
172 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
173 {
174   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
175   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
176 
177   PetscFunctionBegin;
178   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
179   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
180   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
181   PetscFunctionReturn(0);
182 }
183 
184 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
185 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
186 {
187   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
188 
189   PetscFunctionBegin;
190   if (aijkok && aijkok->device_mat_d.data()) {
191     *d_mat = aijkok->device_mat_d.data();
192   } else {
193     PetscErrorCode   ierr;
194     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
195     *d_mat  = NULL;
196   }
197   PetscFunctionReturn(0);
198 }
199 
200 /* Generate the transpose on device and cache it internally */
201 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
202 {
203   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
204 
205   PetscFunctionBegin;
206   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
208     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
209     CHKERRCXX(aijkok->a_dual.sync_device());
210     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
211     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
212     aijkok->transpose_updated = PETSC_TRUE;
213   }
214   *csrmatT = &aijkok->csrmatT;
215   PetscFunctionReturn(0);
216 }
217 
218 /* Generate the Hermitian on device and cache it internally */
219 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
220 {
221   PetscErrorCode                   ierr;
222   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
223 
224   PetscFunctionBegin;
225   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
226   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
227   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
228     CHKERRCXX(aijkok->a_dual.sync_device());
229     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
230     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
231    #if defined(PETSC_USE_COMPLEX)
232     const auto& a = aijkok->csrmatH.values;
233     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
234    #endif
235     aijkok->hermitian_updated = PETSC_TRUE;
236   }
237   *csrmatH = &aijkok->csrmatH;
238   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
239   PetscFunctionReturn(0);
240 }
241 
242 /* y = A x */
243 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
244 {
245   PetscErrorCode                   ierr;
246   Mat_SeqAIJKokkos                 *aijkok;
247   ConstPetscScalarKokkosView       xv;
248   PetscScalarKokkosView            yv;
249 
250   PetscFunctionBegin;
251   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
252   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
253   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
254   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
255   ierr   = PetscLogGpuTimeBegin();CHKERRQ(ierr);
256   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
257   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
258   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
259   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
260   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
261   PetscFunctionReturn(0);
262 }
263 
264 /* y = A^T x */
265 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
266 {
267   PetscErrorCode                   ierr;
268   Mat_SeqAIJKokkos                 *aijkok;
269   const char                       *mode;
270   ConstPetscScalarKokkosView       xv;
271   PetscScalarKokkosView            yv;
272   KokkosCsrMatrix                  *csrmat;
273 
274   PetscFunctionBegin;
275   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
276   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
277   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
278   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
279   if (A->form_explicit_transpose) {
280     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
281     mode = "N";
282   } else {
283     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
284     csrmat = &aijkok->csrmat;
285     mode = "T";
286   }
287   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
288   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
289   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
290   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
291   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
292   PetscFunctionReturn(0);
293 }
294 
295 /* y = A^H x */
296 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
297 {
298   PetscErrorCode                   ierr;
299   Mat_SeqAIJKokkos                 *aijkok;
300   const char                       *mode;
301   ConstPetscScalarKokkosView       xv;
302   PetscScalarKokkosView            yv;
303   KokkosCsrMatrix                  *csrmat;
304 
305   PetscFunctionBegin;
306   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
307   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
308   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
309   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
310   if (A->form_explicit_transpose) {
311     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
312     mode = "N";
313   } else {
314     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
315     csrmat = &aijkok->csrmat;
316     mode = "C";
317   }
318   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
319   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
320   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
321   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
322   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
323   PetscFunctionReturn(0);
324 }
325 
326 /* z = A x + y */
327 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
328 {
329   PetscErrorCode                   ierr;
330   Mat_SeqAIJKokkos                 *aijkok;
331   ConstPetscScalarKokkosView       xv,yv;
332   PetscScalarKokkosView            zv;
333 
334   PetscFunctionBegin;
335   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
336   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
337   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
338   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
339   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
340   if (zz != yy) Kokkos::deep_copy(zv,yv);
341   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
342   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
343   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
344   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
345   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
346   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
347   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
348   PetscFunctionReturn(0);
349 }
350 
351 /* z = A^T x + y */
352 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
353 {
354   PetscErrorCode                   ierr;
355   Mat_SeqAIJKokkos                 *aijkok;
356   const char                       *mode;
357   ConstPetscScalarKokkosView       xv,yv;
358   PetscScalarKokkosView            zv;
359   KokkosCsrMatrix                  *csrmat;
360 
361   PetscFunctionBegin;
362   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
363   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
364   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
365   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
366   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
367   if (zz != yy) Kokkos::deep_copy(zv,yv);
368   if (A->form_explicit_transpose) {
369     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
370     mode = "N";
371   } else {
372     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
373     csrmat = &aijkok->csrmat;
374     mode = "T";
375   }
376   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
377   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
378   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
379   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
380   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
381   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
382   PetscFunctionReturn(0);
383 }
384 
385 /* z = A^H x + y */
386 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
387 {
388   PetscErrorCode                   ierr;
389   Mat_SeqAIJKokkos                 *aijkok;
390   const char                       *mode;
391   ConstPetscScalarKokkosView       xv,yv;
392   PetscScalarKokkosView            zv;
393   KokkosCsrMatrix                  *csrmat;
394 
395   PetscFunctionBegin;
396   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
397   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
398   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
399   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
400   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
401   if (zz != yy) Kokkos::deep_copy(zv,yv);
402   if (A->form_explicit_transpose) {
403     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
404     mode = "N";
405   } else {
406     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
407     csrmat = &aijkok->csrmat;
408     mode = "C";
409   }
410   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
411   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
412   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
413   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
414   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
415   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
416   PetscFunctionReturn(0);
417 }
418 
419 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
420 {
421   PetscErrorCode            ierr;
422   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
423 
424   PetscFunctionBegin;
425   switch (op) {
426     case MAT_FORM_EXPLICIT_TRANSPOSE:
427       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
428       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
429       A->form_explicit_transpose = flg;
430       break;
431     default:
432       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
433       break;
434   }
435   PetscFunctionReturn(0);
436 }
437 
438 /* Depending on reuse, either build a new mat, or use the existing mat */
439 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
440 {
441   PetscErrorCode   ierr;
442   Mat_SeqAIJ       *aseq;
443 
444   PetscFunctionBegin;
445   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
446   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
447     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
448   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
449     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
450   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
451     PetscCheckFalse(A != *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
452     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
453     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
454     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
455     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
456     aseq = static_cast<Mat_SeqAIJ*>(A->data);
457     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
458       PetscCheckFalse(A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
459       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
460     }
461   }
462   PetscFunctionReturn(0);
463 }
464 
465 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
466    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
467  */
468 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
469 {
470   PetscErrorCode        ierr;
471   Mat_SeqAIJ            *bseq;
472   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
473   Mat                   mat;
474 
475   PetscFunctionBegin;
476   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
477   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
478   mat  = *B;
479   if (A->assembled) {
480     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
481     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
482     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
483     /* Now copy values to B if needed */
484     if (dupOption == MAT_COPY_VALUES) {
485       if (akok->a_dual.need_sync_device()) {
486         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
487         bkok->a_dual.modify_host();
488       } else { /* If device has the latest data, we only copy data on device */
489         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
490         bkok->a_dual.modify_device();
491       }
492     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
493       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
494       bkok->a_dual.modify_host();
495     }
496     mat->spptr = bkok;
497   }
498 
499   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
500   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
501   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
502   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
503   PetscFunctionReturn(0);
504 }
505 
506 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
507 {
508   PetscErrorCode    ierr;
509   Mat               At;
510   KokkosCsrMatrix   *internT;
511   Mat_SeqAIJKokkos  *atkok,*bkok;
512 
513   PetscFunctionBegin;
514   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
515   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
516     /* Deep copy internT, as we want to isolate the internal transpose */
517     CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
518     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
519     if (reuse == MAT_INITIAL_MATRIX) *B = At;
520     else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
521   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
522     if ((*B)->assembled) {
523       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
524       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
525       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
526     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
527       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
528       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
529       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
530       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
531       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
532     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
533   }
534   PetscFunctionReturn(0);
535 }
536 
537 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
538 {
539   PetscErrorCode             ierr;
540   Mat_SeqAIJKokkos           *aijkok;
541 
542   PetscFunctionBegin;
543   if (A->factortype == MAT_FACTOR_NONE) {
544     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
545     delete aijkok;
546   } else {
547     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
548   }
549   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
550   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
551   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       NULL);CHKERRQ(ierr);
552   A->spptr = NULL;
553   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
554   PetscFunctionReturn(0);
555 }
556 
557 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
558 {
559   PetscErrorCode ierr;
560 
561   PetscFunctionBegin;
562   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
563   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
564   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
565   PetscFunctionReturn(0);
566 }
567 
568 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
569 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
570 {
571   PetscErrorCode               ierr;
572   Mat_SeqAIJ                   *a,*b;
573   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
574   MatScalarKokkosView          aa,ba,ca;
575   MatRowMapKokkosView          ai,bi,ci;
576   MatColIdxKokkosView          aj,bj,cj;
577   PetscInt                     m,n,nnz,aN;
578 
579   PetscFunctionBegin;
580   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
581   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
582   PetscValidPointer(C,4);
583   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
584   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
585   PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
586   PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
587 
588   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
589   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
590   a    = static_cast<Mat_SeqAIJ*>(A->data);
591   b    = static_cast<Mat_SeqAIJ*>(B->data);
592   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
593   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
594   aa   = akok->a_dual.view_device();
595   ai   = akok->i_dual.view_device();
596   ba   = bkok->a_dual.view_device();
597   bi   = bkok->i_dual.view_device();
598   m    = A->rmap->n; /* M, N and nnz of C */
599   n    = A->cmap->n + B->cmap->n;
600   nnz  = a->nz + b->nz;
601   aN   = A->cmap->n; /* N of A */
602   if (reuse == MAT_INITIAL_MATRIX) {
603     aj = akok->j_dual.view_device();
604     bj = bkok->j_dual.view_device();
605     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
606     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
607     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
608     ca = ca_dual.view_device();
609     ci = ci_dual.view_device();
610     cj = cj_dual.view_device();
611 
612     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
613     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
614       PetscInt i = t.league_rank(); /* row i */
615       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
616 
617       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
618         ci(i) = coffset;
619         if (i == m-1) ci(m) = ai(m) + bi(m);
620       });
621 
622       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
623         if (k < alen) {
624           ca(coffset+k) = aa(ai(i)+k);
625           cj(coffset+k) = aj(ai(i)+k);
626         } else {
627           ca(coffset+k) = ba(bi(i)+k-alen);
628           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
629         }
630       });
631     });
632     ca_dual.modify_device();
633     ci_dual.modify_device();
634     cj_dual.modify_device();
635     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
636     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
637   } else if (reuse == MAT_REUSE_MATRIX) {
638     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
639     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
640     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
641     ca   = ckok->a_dual.view_device();
642     ci   = ckok->i_dual.view_device();
643 
644     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
645       PetscInt i = t.league_rank(); /* row i */
646       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
647       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
648         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
649         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
650       });
651     });
652     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
653   }
654   PetscFunctionReturn(0);
655 }
656 
657 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
658 {
659   PetscFunctionBegin;
660   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
661   PetscFunctionReturn(0);
662 }
663 
664 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
665 {
666   PetscErrorCode                 ierr;
667   Mat_Product                    *product = C->product;
668   Mat                            A,B;
669   bool                           transA,transB; /* use bool, since KK needs this type */
670   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
671   Mat_SeqAIJ                     *c;
672   MatProductData_SeqAIJKokkos    *pdata;
673   KokkosCsrMatrix                *csrmatA,*csrmatB;
674 
675   PetscFunctionBegin;
676   MatCheckProduct(C,1);
677   PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
678   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
679 
680   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
681     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
682     PetscFunctionReturn(0);
683   }
684 
685   switch (product->type) {
686     case MATPRODUCT_AB:  transA = false; transB = false; break;
687     case MATPRODUCT_AtB: transA = true;  transB = false; break;
688     case MATPRODUCT_ABt: transA = false; transB = true;  break;
689     default:
690       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
691   }
692 
693   A     = product->A;
694   B     = product->B;
695   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
696   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
697   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
698   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
699   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
700 
701   PetscCheckFalse(!ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
702 
703   csrmatA = &akok->csrmat;
704   csrmatB = &bkok->csrmat;
705 
706   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
707   if (transA) {
708     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
709     transA = false;
710   }
711 
712   if (transB) {
713     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
714     transB = false;
715   }
716   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
717   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
718   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
719   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
720   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
721   /* shorter version of MatAssemblyEnd_SeqAIJ */
722   c = (Mat_SeqAIJ*)C->data;
723   ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
724   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
725   ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
726   c->reallocs         = 0;
727   C->info.mallocs     = 0;
728   C->info.nz_unneeded = 0;
729   C->assembled        = C->was_assembled = PETSC_TRUE;
730   C->num_ass++;
731   PetscFunctionReturn(0);
732 }
733 
734 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
735 {
736   PetscErrorCode                 ierr;
737   Mat_Product                    *product = C->product;
738   MatProductType                 ptype;
739   Mat                            A,B;
740   bool                           transA,transB;
741   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
742   MatProductData_SeqAIJKokkos    *pdata;
743   MPI_Comm                       comm;
744   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
745 
746   PetscFunctionBegin;
747   MatCheckProduct(C,1);
748   ierr = PetscObjectGetComm((PetscObject)C,&comm);
749   PetscCheckFalse(product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
750   A       = product->A;
751   B       = product->B;
752   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
753   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
754   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
755   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
756   csrmatA = &akok->csrmat;
757   csrmatB = &bkok->csrmat;
758 
759   ptype   = product->type;
760   switch (ptype) {
761     case MATPRODUCT_AB:  transA = false; transB = false; break;
762     case MATPRODUCT_AtB: transA = true;  transB = false; break;
763     case MATPRODUCT_ABt: transA = false; transB = true;  break;
764     default:
765       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
766   }
767 
768   product->data = pdata = new MatProductData_SeqAIJKokkos();
769   pdata->kh.set_team_work_size(16);
770   pdata->kh.set_dynamic_scheduling(true);
771   pdata->reusesym = product->api_user;
772 
773   /* TODO: add command line options to select spgemm algorithms */
774   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
775   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
776      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
777    */
778   pdata->kh.create_spgemm_handle(spgemm_alg);
779 
780   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
781   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
782   if (transA) {
783     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
784     transA = false;
785   }
786 
787   if (transB) {
788     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
789     transB = false;
790   }
791 
792   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
793   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
794     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
795     calling new Mat_SeqAIJKokkos().
796     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
797   */
798   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
799   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
800   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
801 
802   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
803   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
804   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
805   PetscFunctionReturn(0);
806 }
807 
808 /* handles sparse matrix matrix ops */
809 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
810 {
811   PetscErrorCode ierr;
812   Mat_Product    *product = mat->product;
813   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
814 
815   PetscFunctionBegin;
816   MatCheckProduct(mat,1);
817   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
818   if (product->type == MATPRODUCT_ABC) {
819     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
820   }
821   if (Biskok && Ciskok) {
822     switch (product->type) {
823     case MATPRODUCT_AB:
824     case MATPRODUCT_AtB:
825     case MATPRODUCT_ABt:
826       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
827       break;
828     case MATPRODUCT_PtAP:
829     case MATPRODUCT_RARt:
830     case MATPRODUCT_ABC:
831       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
832       break;
833     default:
834       break;
835     }
836   } else { /* fallback for AIJ */
837     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
838   }
839   PetscFunctionReturn(0);
840 }
841 
842 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
843 {
844   PetscErrorCode   ierr;
845   Mat_SeqAIJKokkos *aijkok;
846 
847   PetscFunctionBegin;
848   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
849   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
850   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
851   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
852   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
853   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
854   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
855   PetscFunctionReturn(0);
856 }
857 
858 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
859 {
860   PetscErrorCode   ierr;
861   Mat_SeqAIJKokkos *aijkok;
862 
863   PetscFunctionBegin;
864   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
865   if (aijkok) { /* Only zero the device if data is already there */
866     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
867     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
868   } else { /* Might be preallocated but not assembled */
869     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
870   }
871   PetscFunctionReturn(0);
872 }
873 
874 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
875 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
876 {
877   PetscErrorCode     ierr;
878   Mat_SeqAIJKokkos   *aijkok;
879 
880   PetscFunctionBegin;
881   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
882   PetscValidPointer(kv,2);
883   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
884   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
885   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
886   *kv    = aijkok->a_dual.view_device();
887   PetscFunctionReturn(0);
888 }
889 
890 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
891 {
892   PetscFunctionBegin;
893   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
894   PetscValidPointer(kv,2);
895   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
896   PetscFunctionReturn(0);
897 }
898 
899 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
900 {
901   PetscErrorCode     ierr;
902   Mat_SeqAIJKokkos   *aijkok;
903 
904   PetscFunctionBegin;
905   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
906   PetscValidPointer(kv,2);
907   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
908   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
909   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
910   *kv    = aijkok->a_dual.view_device();
911   PetscFunctionReturn(0);
912 }
913 
914 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
915 {
916   PetscErrorCode     ierr;
917 
918   PetscFunctionBegin;
919   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
920   PetscValidPointer(kv,2);
921   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
922   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
923   PetscFunctionReturn(0);
924 }
925 
926 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
927 {
928   Mat_SeqAIJKokkos   *aijkok;
929 
930   PetscFunctionBegin;
931   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
932   PetscValidPointer(kv,2);
933   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
934   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
935   *kv    = aijkok->a_dual.view_device();
936   PetscFunctionReturn(0);
937 }
938 
939 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
940 {
941   PetscErrorCode     ierr;
942 
943   PetscFunctionBegin;
944   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
945   PetscValidPointer(kv,2);
946   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
947   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
948   PetscFunctionReturn(0);
949 }
950 
951 /* Computes Y += alpha X */
952 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
953 {
954   PetscErrorCode             ierr;
955   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
956   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
957   ConstMatScalarKokkosView   Xa;
958   MatScalarKokkosView        Ya;
959 
960   PetscFunctionBegin;
961   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
962   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
963   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
964   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
965   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
966 
967   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
968     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
969     PetscBool e;
970     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
971     if (e) {
972       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
973       if (e) pattern = SAME_NONZERO_PATTERN;
974     }
975   }
976 
977   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
978     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
979     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
980     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
981   */
982   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
983   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
984   Xa   = xkok->a_dual.view_device();
985   Ya   = ykok->a_dual.view_device();
986 
987   if (pattern == SAME_NONZERO_PATTERN) {
988     KokkosBlas::axpy(alpha,Xa,Ya);
989     ierr = MatSeqAIJKokkosModifyDevice(Y);
990   } else if (pattern == SUBSET_NONZERO_PATTERN) {
991     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
992     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
993 
994     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
995       PetscInt i = t.league_rank(); /* row i */
996       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
997         PetscInt p,q = Yi(i);
998         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
999           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1000           if (Xj(p) == Yj(q)) { /* Found it */
1001             Ya(q) += alpha * Xa(p);
1002             q++;
1003           } else {
1004             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1005                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1006             */
1007             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1008           }
1009         }
1010       });
1011     });
1012     ierr = MatSeqAIJKokkosModifyDevice(Y);
1013   } else { /* different nonzero patterns */
1014     Mat             Z;
1015     KokkosCsrMatrix zcsr;
1016     KernelHandle    kh;
1017     kh.create_spadd_handle(false);
1018     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1019     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1020     zkok = new Mat_SeqAIJKokkos(zcsr);
1021     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1022     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1023     kh.destroy_spadd_handle();
1024   }
1025   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1026   ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */
1027   PetscFunctionReturn(0);
1028 }
1029 
1030 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
1031 {
1032   PetscErrorCode            ierr;
1033   Mat                       newmat;
1034   Mat_SeqAIJKokkos          *akok;
1035   Mat_SeqAIJ                *aseq;
1036 
1037   PetscFunctionBegin;
1038   ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr);
1039   ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr);
1040   ierr = MatSetType(newmat,MATSEQAIJ);CHKERRQ(ierr);
1041   ierr = MatSetPreallocationCOO_SeqAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
1042   ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
1043   ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
1044   ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
1045   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
1046   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1047   akok->SetUpCOO(aseq);
1048   PetscFunctionReturn(0);
1049 }
1050 
1051 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1052 {
1053   PetscErrorCode              ierr;
1054   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1055   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1056   PetscCount                  Annz = aseq->nz;
1057   const PetscCountKokkosView& jmap = akok->jmap_d;
1058   const PetscCountKokkosView& perm = akok->perm_d;
1059   MatScalarKokkosView         Aa;
1060   ConstMatScalarKokkosView    kv;
1061   PetscMemType                memtype;
1062 
1063   PetscFunctionBegin;
1064   PetscAssert(A->assembled,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Expected matrix to be already assembled in MatSetPreallocationCOO()");
1065   ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
1066   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1067     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
1068   } else {
1069     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
1070   }
1071 
1072   if (imode == INSERT_VALUES) {ierr = MatSeqAIJGetKokkosViewWrite(A,&Aa);CHKERRQ(ierr);} /* write matrix values */
1073   else {ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr);} /* read & write matrix values */
1074 
1075   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {
1076     for (PetscCount k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));
1077   });
1078 
1079   if (imode == INSERT_VALUES) {ierr = MatSeqAIJRestoreKokkosViewWrite(A,&Aa);CHKERRQ(ierr);}
1080   else {ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);}
1081   PetscFunctionReturn(0);
1082 }
1083 
1084 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1085 {
1086   PetscErrorCode ierr;
1087 
1088   PetscFunctionBegin;
1089   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
1090   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
1091   B->offloadmask = PETSC_OFFLOAD_CPU;
1092   PetscFunctionReturn(0);
1093 }
1094 
1095 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1096 {
1097   PetscErrorCode     ierr;
1098   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1099 
1100   PetscFunctionBegin;
1101   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1102   A->boundtocpu  = PETSC_FALSE;
1103 
1104   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1105   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1106   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1107   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1108   A->ops->scale                     = MatScale_SeqAIJKokkos;
1109   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1110   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1111   A->ops->mult                      = MatMult_SeqAIJKokkos;
1112   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1113   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1114   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1115   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1116   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1117   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1118   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1119   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1120   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1121   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1122   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1123   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1124   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1125   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1126 
1127   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr);
1128   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr);
1129   PetscFunctionReturn(0);
1130 }
1131 
1132 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1133 {
1134   PetscErrorCode ierr;
1135   Mat_SeqAIJ     *aseq;
1136   PetscInt       i,m,n;
1137 
1138   PetscFunctionBegin;
1139   PetscCheckFalse(A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1140 
1141   m    = akok->nrows();
1142   n    = akok->ncols();
1143   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1144   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1145 
1146   /* Set up data structures of A as a MATSEQAIJ */
1147   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1148   aseq = (Mat_SeqAIJ*)(A)->data;
1149 
1150   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1151   akok->j_dual.sync_host();
1152 
1153   aseq->i            = akok->i_host_data();
1154   aseq->j            = akok->j_host_data();
1155   aseq->a            = akok->a_host_data();
1156   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1157   aseq->singlemalloc = PETSC_FALSE;
1158   aseq->free_a       = PETSC_FALSE;
1159   aseq->free_ij      = PETSC_FALSE;
1160   aseq->nz           = akok->nnz();
1161   aseq->maxnz        = aseq->nz;
1162 
1163   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1164   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1165   for (i=0; i<m; i++) {
1166     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1167   }
1168 
1169   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1170   akok->nonzerostate = A->nonzerostate;
1171   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1172   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1173   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1174   PetscFunctionReturn(0);
1175 }
1176 
1177 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1178 
1179    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1180  */
1181 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1182 {
1183   PetscErrorCode ierr;
1184 
1185   PetscFunctionBegin;
1186   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1187   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
1188   PetscFunctionReturn(0);
1189 }
1190 
1191 /* --------------------------------------------------------------------------------*/
1192 /*@C
1193    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1194    (the default parallel PETSc format). This matrix will ultimately be handled by
1195    Kokkos for calculations. For good matrix
1196    assembly performance the user should preallocate the matrix storage by setting
1197    the parameter nz (or the array nnz).  By setting these parameters accurately,
1198    performance during matrix assembly can be increased by more than a factor of 50.
1199 
1200    Collective
1201 
1202    Input Parameters:
1203 +  comm - MPI communicator, set to PETSC_COMM_SELF
1204 .  m - number of rows
1205 .  n - number of columns
1206 .  nz - number of nonzeros per row (same for all rows)
1207 -  nnz - array containing the number of nonzeros in the various rows
1208          (possibly different for each row) or NULL
1209 
1210    Output Parameter:
1211 .  A - the matrix
1212 
1213    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1214    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1215    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1216 
1217    Notes:
1218    If nnz is given then nz is ignored
1219 
1220    The AIJ format (also called the Yale sparse matrix format or
1221    compressed row storage), is fully compatible with standard Fortran 77
1222    storage.  That is, the stored row and column indices can begin at
1223    either one (as in Fortran) or zero.  See the users' manual for details.
1224 
1225    Specify the preallocated storage with either nz or nnz (not both).
1226    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1227    allocation.  For large problems you MUST preallocate memory or you
1228    will get TERRIBLE performance, see the users' manual chapter on matrices.
1229 
1230    By default, this format uses inodes (identical nodes) when possible, to
1231    improve numerical efficiency of matrix-vector products and solves. We
1232    search for consecutive rows with the same nonzero structure, thereby
1233    reusing matrix information to achieve increased efficiency.
1234 
1235    Level: intermediate
1236 
1237 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1238 @*/
1239 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1240 {
1241   PetscErrorCode ierr;
1242 
1243   PetscFunctionBegin;
1244   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1245   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1246   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1247   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1248   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1249   PetscFunctionReturn(0);
1250 }
1251 
1252 typedef Kokkos::TeamPolicy<>::member_type team_member;
1253 //
1254 // This factorization exploits block diagonal matrices with "Nf" (not used).
1255 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1256 //
1257 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1258 {
1259   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1260   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1261   IS                 isrow = b->row,isicol = b->icol;
1262   PetscErrorCode     ierr;
1263   const PetscInt     *r_h,*ic_h;
1264   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1265   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1266   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1267   PetscBool          row_identity,col_identity;
1268   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1269 
1270   PetscFunctionBegin;
1271   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1272   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
1273   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1274   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
1275   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
1276   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
1277   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1278   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1279   {
1280 #define KOKKOS_SHARED_LEVEL 1
1281     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1282     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1283     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1284     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1285     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1286     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1287     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1288     size_t flops_h = 0.0;
1289     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1290     Kokkos::View<size_t> d_flops_k ("flops");
1291     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1292     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1293     Kokkos::deep_copy (d_flops_k, h_flops_k);
1294     Kokkos::deep_copy (d_r_k, h_r_k);
1295     Kokkos::deep_copy (d_ic_k, h_ic_k);
1296     // Fill A --> fact
1297     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1298         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1299         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1300         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1301         // zero rows of B
1302         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1303             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1304             PetscScalar *baL = ba_d + bi_d[rowb];
1305             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1306             /* zero (unfactored row) */
1307             for (int j=0;j<nzbL;j++) baL[j] = 0;
1308             for (int j=0;j<nzbU;j++) baU[j] = 0;
1309           });
1310         // copy A into B
1311         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1312             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1313             const PetscScalar *av    = aa_d + ai_d[rowa];
1314             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1315             /* load in initial (unfactored row) */
1316             for (int j=0;j<nza;j++) {
1317               PetscInt    colb = ic[ajtmp[j]];
1318               PetscScalar vala = av[j];
1319               if (colb == rowb) {
1320                 *(ba_d + bdiag_d[rowb]) = vala;
1321               } else {
1322                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1323                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1324                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1325                 for (int j=0; j<nz ; j++) {
1326                   if (pbj[j] == colb) {
1327                     pba[j] = vala;
1328                     set++;
1329                     break;
1330                   }
1331                 }
1332                #if !defined(PETSC_HAVE_SYCL)
1333                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1334                #endif
1335               }
1336             }
1337           });
1338       });
1339     Kokkos::fence();
1340 
1341     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1342         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1343         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1344         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1345         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1346         const PetscInt  start = field*nloc, end = start + nloc;
1347         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1348         // A22 panel update for each row A(1,:) and col A(:,1)
1349         for (int ii=start; ii<end-1; ii++) {
1350           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1351           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1352           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1353           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1354           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1355               PetscInt kIdx = j*Ni + field_block_idx;
1356               if (kIdx >= nzUi) /* void */ ;
1357               else {
1358                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1359                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1360                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1361                 size_t         st_idx;
1362                 // find and do L(k,i) = A(:k,i) / A(i,i)
1363                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1364                 // get column, there has got to be a better way
1365                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1366                     if (pjL[j] == ii) {
1367                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1368                       idx = j; // output
1369                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1370                     }
1371                 }, st_idx);
1372                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1373 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1374                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1375 #endif
1376                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1377                 // U(i+1,:end)
1378                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1379                       PetscScalar Uij = baUi[uiIdx];
1380                       PetscInt    col = bjUi[uiIdx];
1381                       if (col==myk) {
1382                         // A_kk = A_kk - L_ki * U_ij(k)
1383                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1384                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1385                       } else {
1386                         PetscScalar    *start, *end, *pAkjv=NULL;
1387                         PetscInt       high, low;
1388                         const PetscInt *startj;
1389                         if (col<myk) { // L
1390                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1391                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1392                           start = pLki+1; // start at pLki+1, A22(myk,1)
1393                           startj= bj_d + bi_d[myk] + idx;
1394                           end   = ba_d + bi_d[myk+1];
1395                         } else {
1396                           PetscInt idx = bdiag_d[myk+1]+1;
1397                           start = ba_d + idx;
1398                           startj= bj_d + idx;
1399                           end   = ba_d + bdiag_d[myk];
1400                         }
1401                         // search for 'col', use bisection search - TODO
1402                         low  = 0;
1403                         high = (PetscInt)(end-start);
1404                         while (high-low > 5) {
1405                           int t = (low+high)/2;
1406                           if (startj[t] > col) high = t;
1407                           else                 low  = t;
1408                         }
1409                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1410                           if (startj[pAkjv-start] == col) break;
1411                         }
1412 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1413                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1414 #endif
1415                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1416                       }
1417                     });
1418               }
1419             });
1420           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1421           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1422         } /* endof for (i=0; i<n; i++) { */
1423         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1424       });
1425     Kokkos::fence();
1426     Kokkos::deep_copy (h_flops_k, d_flops_k);
1427     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1428     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1429         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1430         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1431         /* Invert diagonal for simpler triangular solves */
1432         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1433             int i = start + outer_index*Ni + lg_rank%Ni;
1434             if (i < end) {
1435               PetscScalar *pv = ba_d + bdiag_d[i];
1436               *pv = 1.0/(*pv);
1437             }
1438           });
1439       });
1440   }
1441   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1442   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
1443   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
1444 
1445   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1446   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
1447   if (b->inode.size) {
1448     B->ops->solve = MatSolve_SeqAIJ_Inode;
1449   } else if (row_identity && col_identity) {
1450     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1451   } else {
1452     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1453   }
1454   B->offloadmask = PETSC_OFFLOAD_GPU;
1455   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
1456   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1457   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1458   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1459   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1460   B->assembled              = PETSC_TRUE;
1461   B->preallocated           = PETSC_TRUE;
1462 
1463   PetscFunctionReturn(0);
1464 }
1465 
1466 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1467 {
1468   PetscErrorCode   ierr;
1469 
1470   PetscFunctionBegin;
1471   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1472   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1473   PetscFunctionReturn(0);
1474 }
1475 
1476 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1477 {
1478   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1479 
1480   PetscFunctionBegin;
1481   if (!factors->sptrsv_symbolic_completed) {
1482     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1483     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1484     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1485   }
1486   PetscFunctionReturn(0);
1487 }
1488 
1489 /* Check if we need to update factors etc for transpose solve */
1490 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1491 {
1492   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1493   MatColIdxType             n = A->rmap->n;
1494 
1495   PetscFunctionBegin;
1496   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1497     /* Update L^T and do sptrsv symbolic */
1498     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1499     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1500     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1501     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1502 
1503     KokkosKernels::Impl::transpose_matrix<
1504       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1505       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1506       MatRowMapKokkosView,DefaultExecutionSpace>(
1507         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1508         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1509 
1510     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1511       We have to sort the indices, until KK provides finer control options.
1512     */
1513     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1514       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1515         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1516 
1517     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1518 
1519     /* Update U^T and do sptrsv symbolic */
1520     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1521     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1522     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1523     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1524 
1525     KokkosKernels::Impl::transpose_matrix<
1526       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1527       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1528       MatRowMapKokkosView,DefaultExecutionSpace>(
1529         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1530         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1531 
1532     /* Sort indices. See comments above */
1533     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1534       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1535         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1536 
1537     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1538     factors->transpose_updated = PETSC_TRUE;
1539   }
1540   PetscFunctionReturn(0);
1541 }
1542 
1543 /* Solve Ax = b, with A = LU */
1544 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1545 {
1546   PetscErrorCode                 ierr;
1547   ConstPetscScalarKokkosView     bv;
1548   PetscScalarKokkosView          xv;
1549   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1550 
1551   PetscFunctionBegin;
1552   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1553   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
1554   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1555   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1556   /* Solve L tmpv = b */
1557   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1558   /* Solve Ux = tmpv */
1559   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1560   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1561   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1562   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1563   PetscFunctionReturn(0);
1564 }
1565 
1566 /* Solve A^T x = b, where A^T = U^T L^T */
1567 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1568 {
1569   PetscErrorCode                 ierr;
1570   ConstPetscScalarKokkosView     bv;
1571   PetscScalarKokkosView          xv;
1572   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1573 
1574   PetscFunctionBegin;
1575   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1576   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
1577   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1578   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1579   /* Solve U^T tmpv = b */
1580   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1581 
1582   /* Solve L^T x = tmpv */
1583   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1584   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1585   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1586   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1587   PetscFunctionReturn(0);
1588 }
1589 
1590 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1591 {
1592   PetscErrorCode                 ierr;
1593   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1594   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1595   PetscInt                       fill_lev = info->levels;
1596 
1597   PetscFunctionBegin;
1598   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1599   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1600 
1601   auto a_d = aijkok->a_dual.view_device();
1602   auto i_d = aijkok->i_dual.view_device();
1603   auto j_d = aijkok->j_dual.view_device();
1604 
1605   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1606 
1607   B->assembled                       = PETSC_TRUE;
1608   B->preallocated                    = PETSC_TRUE;
1609   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1610   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1611   B->ops->matsolve                   = NULL;
1612   B->ops->matsolvetranspose          = NULL;
1613   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1614 
1615   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1616   factors->transpose_updated         = PETSC_FALSE;
1617   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1618   /* TODO: log flops, but how to know that? */
1619   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1620   PetscFunctionReturn(0);
1621 }
1622 
1623 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1624 {
1625   PetscErrorCode                 ierr;
1626   Mat_SeqAIJKokkos               *aijkok;
1627   Mat_SeqAIJ                     *b;
1628   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1629   PetscInt                       fill_lev = info->levels;
1630   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1631   PetscInt                       n = A->rmap->n;
1632 
1633   PetscFunctionBegin;
1634   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1635   /* Rebuild factors */
1636   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1637   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1638 
1639   /* Create a spiluk handle and then do symbolic factorization */
1640   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1641   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1642 
1643   auto spiluk_handle = factors->kh.get_spiluk_handle();
1644 
1645   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1646   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1647   Kokkos::realloc(factors->iU_d,n+1);
1648   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1649 
1650   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1651   auto i_d = aijkok->i_dual.view_device();
1652   auto j_d = aijkok->j_dual.view_device();
1653   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1654   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1655 
1656   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1657   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1658   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1659   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1660 
1661   /* TODO: add options to select sptrsv algorithms */
1662   /* Create sptrsv handles for L, U and their transpose */
1663  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1664   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1665  #else
1666   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1667  #endif
1668 
1669   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1670   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1671   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1672   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1673 
1674   /* Fill fields of the factor matrix B */
1675   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1676   b     = (Mat_SeqAIJ*)B->data;
1677   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1678   B->info.fill_ratio_given  = info->fill;
1679   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1680 
1681   B->offloadmask            = PETSC_OFFLOAD_GPU;
1682   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1683   PetscFunctionReturn(0);
1684 }
1685 
1686 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1687 {
1688   PetscErrorCode   ierr;
1689   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1690   const PetscInt   nrows   = A->rmap->n;
1691 
1692   PetscFunctionBegin;
1693   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1694   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1695   // move B data into Kokkos
1696   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
1697   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
1698   {
1699     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1700     if (!baijkok->diag_d.extent(0)) {
1701       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1702       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1703       Kokkos::deep_copy (baijkok->diag_d, h_diag);
1704     }
1705   }
1706   PetscFunctionReturn(0);
1707 }
1708 
1709 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1710 {
1711   PetscFunctionBegin;
1712   *type = MATSOLVERKOKKOS;
1713   PetscFunctionReturn(0);
1714 }
1715 
1716 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1717 {
1718   PetscFunctionBegin;
1719   *type = MATSOLVERKOKKOSDEVICE;
1720   PetscFunctionReturn(0);
1721 }
1722 
1723 /*MC
1724   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1725   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1726 
1727   Level: beginner
1728 
1729 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1730 M*/
1731 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1732 {
1733   PetscErrorCode ierr;
1734   PetscInt       n = A->rmap->n;
1735 
1736   PetscFunctionBegin;
1737   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1738   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1739   (*B)->factortype = ftype;
1740   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1741   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1742 
1743   if (ftype == MAT_FACTOR_LU) {
1744     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1745     (*B)->canuseordering         = PETSC_TRUE;
1746     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1747   } else if (ftype == MAT_FACTOR_ILU) {
1748     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1749     (*B)->canuseordering         = PETSC_FALSE;
1750     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1751   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1752 
1753   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1754   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1755   PetscFunctionReturn(0);
1756 }
1757 
1758 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1759 {
1760   PetscErrorCode ierr;
1761   PetscInt       n = A->rmap->n;
1762 
1763   PetscFunctionBegin;
1764   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1765   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1766   (*B)->factortype = ftype;
1767   (*B)->canuseordering = PETSC_TRUE;
1768   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1769   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1770 
1771   if (ftype == MAT_FACTOR_LU) {
1772     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1773     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1774   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1775 
1776   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1777   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1778   PetscFunctionReturn(0);
1779 }
1780 
1781 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1782 {
1783   PetscErrorCode ierr;
1784 
1785   PetscFunctionBegin;
1786   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1787   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1788   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1789   PetscFunctionReturn(0);
1790 }
1791 
1792 /* Utility to print out a KokkosCsrMatrix for debugging */
1793 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1794 {
1795   PetscErrorCode    ierr;
1796   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1797   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1798   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1799   const PetscInt    *i = iv.data();
1800   const PetscInt    *j = jv.data();
1801   const PetscScalar *a = av.data();
1802   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1803 
1804   PetscFunctionBegin;
1805   ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1806   for (PetscInt k=0; k<m; k++) {
1807     ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr);
1808     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1809       ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1810     }
1811     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1812   }
1813   PetscFunctionReturn(0);
1814 }
1815