xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision cfd33b4203be058f602faa5f0659dcc92f5200da)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosKernels_Sorting.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 
18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   PetscErrorCode    ierr;
29   Mat_SeqAIJ        *aijseq;
30   Mat_SeqAIJKokkos  *aijkok;
31 
32   PetscFunctionBegin;
33   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
34   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
35 
36   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38 
39   /* If aijkok does not exist, we just copy i, j to device.
40      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
41      In both cases, we build a new aijkok structure.
42   */
43   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
44     delete aijkok;
45     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
46     A->spptr = aijkok;
47   }
48 
49   if (aijkok && aijkok->device_mat_d.data()) {
50     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
51   }
52   PetscFunctionReturn(0);
53 }
54 
55 /* Sync CSR data to device if not yet */
56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
57 {
58   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
59 
60   PetscFunctionBegin;
61   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
62   PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
63   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
64   if (aijkok->a_dual.need_sync_device()) {
65     aijkok->a_dual.sync_device();
66     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
67     aijkok->hermitian_updated = PETSC_FALSE;
68   }
69   PetscFunctionReturn(0);
70 }
71 
72 /* Mark the CSR data on device as modified */
73 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
74 {
75   PetscErrorCode   ierr;
76   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
77 
78   PetscFunctionBegin;
79   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
80   aijkok->a_dual.clear_sync_state();
81   aijkok->a_dual.modify_device();
82   aijkok->transpose_updated = PETSC_FALSE;
83   aijkok->hermitian_updated = PETSC_FALSE;
84   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
85   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
86   PetscFunctionReturn(0);
87 }
88 
89 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
90 {
91   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
92 
93   PetscFunctionBegin;
94   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
95    /* We do not expect one needs factors on host  */
96   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
97   PetscCheckFalse(!aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
98   aijkok->a_dual.sync_host();
99   PetscFunctionReturn(0);
100 }
101 
102 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
103 {
104   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
105 
106   PetscFunctionBegin;
107   if (aijkok) {
108     aijkok->a_dual.sync_host();
109     *array = aijkok->a_dual.view_host().data();
110   } else { /* Happens when calling MatSetValues on a newly created matrix */
111     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
112   }
113   PetscFunctionReturn(0);
114 }
115 
116 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
117 {
118   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
119 
120   PetscFunctionBegin;
121   if (aijkok) aijkok->a_dual.modify_host();
122   PetscFunctionReturn(0);
123 }
124 
125 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
126 {
127   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
128 
129   PetscFunctionBegin;
130   if (aijkok) {
131     aijkok->a_dual.sync_host();
132     *array = aijkok->a_dual.view_host().data();
133   } else {
134     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
135   }
136   PetscFunctionReturn(0);
137 }
138 
139 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
140 {
141   PetscFunctionBegin;
142   *array = NULL;
143   PetscFunctionReturn(0);
144 }
145 
146 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
147 {
148   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
149 
150   PetscFunctionBegin;
151   if (aijkok) {
152     *array = aijkok->a_dual.view_host().data();
153   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
154     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
155   }
156   PetscFunctionReturn(0);
157 }
158 
159 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
160 {
161   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
162 
163   PetscFunctionBegin;
164   if (aijkok) {
165     aijkok->a_dual.clear_sync_state();
166     aijkok->a_dual.modify_host();
167   }
168   PetscFunctionReturn(0);
169 }
170 
171 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
172 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
173 {
174   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
175   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
176 
177   PetscFunctionBegin;
178   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
179   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
180   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
181   PetscFunctionReturn(0);
182 }
183 
184 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
185 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
186 {
187   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
188 
189   PetscFunctionBegin;
190   if (aijkok && aijkok->device_mat_d.data()) {
191     *d_mat = aijkok->device_mat_d.data();
192   } else {
193     PetscErrorCode   ierr;
194     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
195     *d_mat  = NULL;
196   }
197   PetscFunctionReturn(0);
198 }
199 
200 /* Generate the transpose on device and cache it internally */
201 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
202 {
203   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
204 
205   PetscFunctionBegin;
206   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
208     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
209     CHKERRCXX(aijkok->a_dual.sync_device());
210     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
211     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
212     aijkok->transpose_updated = PETSC_TRUE;
213   }
214   *csrmatT = &aijkok->csrmatT;
215   PetscFunctionReturn(0);
216 }
217 
218 /* Generate the Hermitian on device and cache it internally */
219 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
220 {
221   PetscErrorCode                   ierr;
222   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
223 
224   PetscFunctionBegin;
225   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
226   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
227   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
228     CHKERRCXX(aijkok->a_dual.sync_device());
229     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
230     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
231    #if defined(PETSC_USE_COMPLEX)
232     const auto& a = aijkok->csrmatH.values;
233     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
234    #endif
235     aijkok->hermitian_updated = PETSC_TRUE;
236   }
237   *csrmatH = &aijkok->csrmatH;
238   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
239   PetscFunctionReturn(0);
240 }
241 
242 /* y = A x */
243 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
244 {
245   PetscErrorCode                   ierr;
246   Mat_SeqAIJKokkos                 *aijkok;
247   ConstPetscScalarKokkosView       xv;
248   PetscScalarKokkosView            yv;
249 
250   PetscFunctionBegin;
251   ierr   = PetscLogGpuTimeBegin();CHKERRQ(ierr);
252   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
253   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
254   ierr   = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
255   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
256   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
257   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
258   ierr   = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
259   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
260   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
261   ierr   = PetscLogGpuTimeEnd();CHKERRQ(ierr);
262   PetscFunctionReturn(0);
263 }
264 
265 /* y = A^T x */
266 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
267 {
268   PetscErrorCode                   ierr;
269   Mat_SeqAIJKokkos                 *aijkok;
270   const char                       *mode;
271   ConstPetscScalarKokkosView       xv;
272   PetscScalarKokkosView            yv;
273   KokkosCsrMatrix                  *csrmat;
274 
275   PetscFunctionBegin;
276   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
277   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
278   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
279   ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
280   if (A->form_explicit_transpose) {
281     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
282     mode = "N";
283   } else {
284     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
285     csrmat = &aijkok->csrmat;
286     mode = "T";
287   }
288   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
289   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
290   ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
291   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
292   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
293   PetscFunctionReturn(0);
294 }
295 
296 /* y = A^H x */
297 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
298 {
299   PetscErrorCode                   ierr;
300   Mat_SeqAIJKokkos                 *aijkok;
301   const char                       *mode;
302   ConstPetscScalarKokkosView       xv;
303   PetscScalarKokkosView            yv;
304   KokkosCsrMatrix                  *csrmat;
305 
306   PetscFunctionBegin;
307   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
308   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
309   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
310   ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
311   if (A->form_explicit_transpose) {
312     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
313     mode = "N";
314   } else {
315     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
316     csrmat = &aijkok->csrmat;
317     mode = "C";
318   }
319   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
320   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
321   ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
322   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
323   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
324   PetscFunctionReturn(0);
325 }
326 
327 /* z = A x + y */
328 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
329 {
330   PetscErrorCode                   ierr;
331   Mat_SeqAIJKokkos                 *aijkok;
332   ConstPetscScalarKokkosView       xv,yv;
333   PetscScalarKokkosView            zv;
334 
335   PetscFunctionBegin;
336   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
337   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
338   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
339   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
340   ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
341   if (zz != yy) Kokkos::deep_copy(zv,yv);
342   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
343   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
344   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
345   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
346   ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
347   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
348   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
349   PetscFunctionReturn(0);
350 }
351 
352 /* z = A^T x + y */
353 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
354 {
355   PetscErrorCode                   ierr;
356   Mat_SeqAIJKokkos                 *aijkok;
357   const char                       *mode;
358   ConstPetscScalarKokkosView       xv,yv;
359   PetscScalarKokkosView            zv;
360   KokkosCsrMatrix                  *csrmat;
361 
362   PetscFunctionBegin;
363   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
364   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
365   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
366   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
367   ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
368   if (zz != yy) Kokkos::deep_copy(zv,yv);
369   if (A->form_explicit_transpose) {
370     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
371     mode = "N";
372   } else {
373     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
374     csrmat = &aijkok->csrmat;
375     mode = "T";
376   }
377   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
378   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
379   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
380   ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
381   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
382   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
383   PetscFunctionReturn(0);
384 }
385 
386 /* z = A^H x + y */
387 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
388 {
389   PetscErrorCode                   ierr;
390   Mat_SeqAIJKokkos                 *aijkok;
391   const char                       *mode;
392   ConstPetscScalarKokkosView       xv,yv;
393   PetscScalarKokkosView            zv;
394   KokkosCsrMatrix                  *csrmat;
395 
396   PetscFunctionBegin;
397   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
398   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
399   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
400   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
401   ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
402   if (zz != yy) Kokkos::deep_copy(zv,yv);
403   if (A->form_explicit_transpose) {
404     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
405     mode = "N";
406   } else {
407     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
408     csrmat = &aijkok->csrmat;
409     mode = "C";
410   }
411   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
412   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
413   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
414   ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
415   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
416   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
417   PetscFunctionReturn(0);
418 }
419 
420 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
421 {
422   PetscErrorCode            ierr;
423   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
424 
425   PetscFunctionBegin;
426   switch (op) {
427     case MAT_FORM_EXPLICIT_TRANSPOSE:
428       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
429       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
430       A->form_explicit_transpose = flg;
431       break;
432     default:
433       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
434       break;
435   }
436   PetscFunctionReturn(0);
437 }
438 
439 /* Depending on reuse, either build a new mat, or use the existing mat */
440 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
441 {
442   PetscErrorCode   ierr;
443   Mat_SeqAIJ       *aseq;
444 
445   PetscFunctionBegin;
446   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
447   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
448     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
449   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
450     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
451   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
452     PetscCheckFalse(A != *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
453     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
454     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
455     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
456     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
457     aseq = static_cast<Mat_SeqAIJ*>(A->data);
458     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
459       PetscCheckFalse(A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
460       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
461     }
462   }
463   PetscFunctionReturn(0);
464 }
465 
466 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
467    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
468  */
469 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
470 {
471   PetscErrorCode        ierr;
472   Mat_SeqAIJ            *bseq;
473   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
474   Mat                   mat;
475 
476   PetscFunctionBegin;
477   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
478   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
479   mat  = *B;
480   if (A->assembled) {
481     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
482     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
483     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
484     /* Now copy values to B if needed */
485     if (dupOption == MAT_COPY_VALUES) {
486       if (akok->a_dual.need_sync_device()) {
487         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
488         bkok->a_dual.modify_host();
489       } else { /* If device has the latest data, we only copy data on device */
490         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
491         bkok->a_dual.modify_device();
492       }
493     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
494       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
495       bkok->a_dual.modify_host();
496     }
497     mat->spptr = bkok;
498   }
499 
500   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
501   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
502   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
503   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
504   PetscFunctionReturn(0);
505 }
506 
507 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
508 {
509   PetscErrorCode    ierr;
510   Mat               At;
511   KokkosCsrMatrix   *internT;
512   Mat_SeqAIJKokkos  *atkok,*bkok;
513 
514   PetscFunctionBegin;
515   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
516   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
517     /* Deep copy internT, as we want to isolate the internal transpose */
518     CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
519     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
520     if (reuse == MAT_INITIAL_MATRIX) *B = At;
521     else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
522   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
523     if ((*B)->assembled) {
524       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
525       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
526       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
527     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
528       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
529       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
530       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
531       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
532       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
533     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
534   }
535   PetscFunctionReturn(0);
536 }
537 
538 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
539 {
540   PetscErrorCode             ierr;
541   Mat_SeqAIJKokkos           *aijkok;
542 
543   PetscFunctionBegin;
544   if (A->factortype == MAT_FACTOR_NONE) {
545     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
546     delete aijkok;
547   } else {
548     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
549   }
550   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
551   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
552   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       NULL);CHKERRQ(ierr);
553   A->spptr = NULL;
554   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
555   PetscFunctionReturn(0);
556 }
557 
558 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
559 {
560   PetscErrorCode ierr;
561 
562   PetscFunctionBegin;
563   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
564   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
565   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
566   PetscFunctionReturn(0);
567 }
568 
569 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
570 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
571 {
572   PetscErrorCode               ierr;
573   Mat_SeqAIJ                   *a,*b;
574   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
575   MatScalarKokkosView          aa,ba,ca;
576   MatRowMapKokkosView          ai,bi,ci;
577   MatColIdxKokkosView          aj,bj,cj;
578   PetscInt                     m,n,nnz,aN;
579 
580   PetscFunctionBegin;
581   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
582   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
583   PetscValidPointer(C,4);
584   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
585   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
586   PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
587   PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
588 
589   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
590   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
591   a    = static_cast<Mat_SeqAIJ*>(A->data);
592   b    = static_cast<Mat_SeqAIJ*>(B->data);
593   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
594   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
595   aa   = akok->a_dual.view_device();
596   ai   = akok->i_dual.view_device();
597   ba   = bkok->a_dual.view_device();
598   bi   = bkok->i_dual.view_device();
599   m    = A->rmap->n; /* M, N and nnz of C */
600   n    = A->cmap->n + B->cmap->n;
601   nnz  = a->nz + b->nz;
602   aN   = A->cmap->n; /* N of A */
603   if (reuse == MAT_INITIAL_MATRIX) {
604     aj = akok->j_dual.view_device();
605     bj = bkok->j_dual.view_device();
606     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
607     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
608     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
609     ca = ca_dual.view_device();
610     ci = ci_dual.view_device();
611     cj = cj_dual.view_device();
612 
613     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
614     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
615       PetscInt i = t.league_rank(); /* row i */
616       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
617 
618       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
619         ci(i) = coffset;
620         if (i == m-1) ci(m) = ai(m) + bi(m);
621       });
622 
623       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
624         if (k < alen) {
625           ca(coffset+k) = aa(ai(i)+k);
626           cj(coffset+k) = aj(ai(i)+k);
627         } else {
628           ca(coffset+k) = ba(bi(i)+k-alen);
629           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
630         }
631       });
632     });
633     ca_dual.modify_device();
634     ci_dual.modify_device();
635     cj_dual.modify_device();
636     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
637     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
638   } else if (reuse == MAT_REUSE_MATRIX) {
639     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
640     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
641     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
642     ca   = ckok->a_dual.view_device();
643     ci   = ckok->i_dual.view_device();
644 
645     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
646       PetscInt i = t.league_rank(); /* row i */
647       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
648       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
649         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
650         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
651       });
652     });
653     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
654   }
655   PetscFunctionReturn(0);
656 }
657 
658 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
659 {
660   PetscFunctionBegin;
661   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
662   PetscFunctionReturn(0);
663 }
664 
665 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
666 {
667   PetscErrorCode                 ierr;
668   Mat_Product                    *product = C->product;
669   Mat                            A,B;
670   bool                           transA,transB; /* use bool, since KK needs this type */
671   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
672   Mat_SeqAIJ                     *c;
673   MatProductData_SeqAIJKokkos    *pdata;
674   KokkosCsrMatrix                *csrmatA,*csrmatB;
675 
676   PetscFunctionBegin;
677   MatCheckProduct(C,1);
678   PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
679   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
680 
681   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
682     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
683     PetscFunctionReturn(0);
684   }
685 
686   switch (product->type) {
687     case MATPRODUCT_AB:  transA = false; transB = false; break;
688     case MATPRODUCT_AtB: transA = true;  transB = false; break;
689     case MATPRODUCT_ABt: transA = false; transB = true;  break;
690     default:
691       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
692   }
693 
694   A     = product->A;
695   B     = product->B;
696   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
697   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
698   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
699   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
700   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
701 
702   PetscCheckFalse(!ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
703 
704   csrmatA = &akok->csrmat;
705   csrmatB = &bkok->csrmat;
706 
707   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
708   if (transA) {
709     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
710     transA = false;
711   }
712 
713   if (transB) {
714     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
715     transB = false;
716   }
717   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
718   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
719   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
720   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
721   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
722   /* shorter version of MatAssemblyEnd_SeqAIJ */
723   c = (Mat_SeqAIJ*)C->data;
724   ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
725   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
726   ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
727   c->reallocs         = 0;
728   C->info.mallocs     = 0;
729   C->info.nz_unneeded = 0;
730   C->assembled        = C->was_assembled = PETSC_TRUE;
731   C->num_ass++;
732   PetscFunctionReturn(0);
733 }
734 
735 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
736 {
737   PetscErrorCode                 ierr;
738   Mat_Product                    *product = C->product;
739   MatProductType                 ptype;
740   Mat                            A,B;
741   bool                           transA,transB;
742   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
743   MatProductData_SeqAIJKokkos    *pdata;
744   MPI_Comm                       comm;
745   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
746 
747   PetscFunctionBegin;
748   MatCheckProduct(C,1);
749   ierr = PetscObjectGetComm((PetscObject)C,&comm);
750   PetscCheckFalse(product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
751   A       = product->A;
752   B       = product->B;
753   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
754   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
755   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
756   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
757   csrmatA = &akok->csrmat;
758   csrmatB = &bkok->csrmat;
759 
760   ptype   = product->type;
761   switch (ptype) {
762     case MATPRODUCT_AB:  transA = false; transB = false; break;
763     case MATPRODUCT_AtB: transA = true;  transB = false; break;
764     case MATPRODUCT_ABt: transA = false; transB = true;  break;
765     default:
766       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
767   }
768 
769   product->data = pdata = new MatProductData_SeqAIJKokkos();
770   pdata->kh.set_team_work_size(16);
771   pdata->kh.set_dynamic_scheduling(true);
772   pdata->reusesym = product->api_user;
773 
774   /* TODO: add command line options to select spgemm algorithms */
775   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
776   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
777      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
778    */
779   pdata->kh.create_spgemm_handle(spgemm_alg);
780 
781   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
782   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
783   if (transA) {
784     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
785     transA = false;
786   }
787 
788   if (transB) {
789     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
790     transB = false;
791   }
792 
793   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
794   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
795     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
796     calling new Mat_SeqAIJKokkos().
797     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
798   */
799   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
800   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
801   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
802 
803   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
804   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
805   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
806   PetscFunctionReturn(0);
807 }
808 
809 /* handles sparse matrix matrix ops */
810 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
811 {
812   PetscErrorCode ierr;
813   Mat_Product    *product = mat->product;
814   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
815 
816   PetscFunctionBegin;
817   MatCheckProduct(mat,1);
818   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
819   if (product->type == MATPRODUCT_ABC) {
820     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
821   }
822   if (Biskok && Ciskok) {
823     switch (product->type) {
824     case MATPRODUCT_AB:
825     case MATPRODUCT_AtB:
826     case MATPRODUCT_ABt:
827       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
828       break;
829     case MATPRODUCT_PtAP:
830     case MATPRODUCT_RARt:
831     case MATPRODUCT_ABC:
832       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
833       break;
834     default:
835       break;
836     }
837   } else { /* fallback for AIJ */
838     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
839   }
840   PetscFunctionReturn(0);
841 }
842 
843 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
844 {
845   PetscErrorCode   ierr;
846   Mat_SeqAIJKokkos *aijkok;
847 
848   PetscFunctionBegin;
849   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
850   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
851   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
852   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
853   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
854   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
855   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
856   PetscFunctionReturn(0);
857 }
858 
859 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
860 {
861   PetscErrorCode   ierr;
862   Mat_SeqAIJKokkos *aijkok;
863 
864   PetscFunctionBegin;
865   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
866   if (aijkok) { /* Only zero the device if data is already there */
867     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
868     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
869   } else { /* Might be preallocated but not assembled */
870     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
871   }
872   PetscFunctionReturn(0);
873 }
874 
875 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
876 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
877 {
878   PetscErrorCode     ierr;
879   Mat_SeqAIJKokkos   *aijkok;
880 
881   PetscFunctionBegin;
882   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
883   PetscValidPointer(kv,2);
884   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
885   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
886   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
887   *kv    = aijkok->a_dual.view_device();
888   PetscFunctionReturn(0);
889 }
890 
891 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
892 {
893   PetscFunctionBegin;
894   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
895   PetscValidPointer(kv,2);
896   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
897   PetscFunctionReturn(0);
898 }
899 
900 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
901 {
902   PetscErrorCode     ierr;
903   Mat_SeqAIJKokkos   *aijkok;
904 
905   PetscFunctionBegin;
906   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
907   PetscValidPointer(kv,2);
908   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
909   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
910   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
911   *kv    = aijkok->a_dual.view_device();
912   PetscFunctionReturn(0);
913 }
914 
915 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
916 {
917   PetscErrorCode     ierr;
918 
919   PetscFunctionBegin;
920   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
921   PetscValidPointer(kv,2);
922   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
923   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
924   PetscFunctionReturn(0);
925 }
926 
927 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
928 {
929   Mat_SeqAIJKokkos   *aijkok;
930 
931   PetscFunctionBegin;
932   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
933   PetscValidPointer(kv,2);
934   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
935   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
936   *kv    = aijkok->a_dual.view_device();
937   PetscFunctionReturn(0);
938 }
939 
940 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
941 {
942   PetscErrorCode     ierr;
943 
944   PetscFunctionBegin;
945   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
946   PetscValidPointer(kv,2);
947   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
948   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
949   PetscFunctionReturn(0);
950 }
951 
952 /* Computes Y += alpha X */
953 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
954 {
955   PetscErrorCode             ierr;
956   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
957   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
958   ConstMatScalarKokkosView   Xa;
959   MatScalarKokkosView        Ya;
960 
961   PetscFunctionBegin;
962   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
963   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
964   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
965   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
966   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
967 
968   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
969     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
970     PetscBool e;
971     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
972     if (e) {
973       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
974       if (e) pattern = SAME_NONZERO_PATTERN;
975     }
976   }
977 
978   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
979     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
980     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
981     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
982   */
983   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
984   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
985   Xa   = xkok->a_dual.view_device();
986   Ya   = ykok->a_dual.view_device();
987 
988   if (pattern == SAME_NONZERO_PATTERN) {
989     KokkosBlas::axpy(alpha,Xa,Ya);
990     ierr = MatSeqAIJKokkosModifyDevice(Y);
991   } else if (pattern == SUBSET_NONZERO_PATTERN) {
992     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
993     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
994 
995     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
996       PetscInt i = t.league_rank(); /* row i */
997       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
998         PetscInt p,q = Yi(i);
999         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1000           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1001           if (Xj(p) == Yj(q)) { /* Found it */
1002             Ya(q) += alpha * Xa(p);
1003             q++;
1004           } else {
1005             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1006                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1007             */
1008             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1009           }
1010         }
1011       });
1012     });
1013     ierr = MatSeqAIJKokkosModifyDevice(Y);
1014   } else { /* different nonzero patterns */
1015     Mat             Z;
1016     KokkosCsrMatrix zcsr;
1017     KernelHandle    kh;
1018     kh.create_spadd_handle(false);
1019     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1020     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1021     zkok = new Mat_SeqAIJKokkos(zcsr);
1022     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1023     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1024     kh.destroy_spadd_handle();
1025   }
1026   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1027   ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */
1028   PetscFunctionReturn(0);
1029 }
1030 
1031 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
1032 {
1033   PetscErrorCode            ierr;
1034   Mat                       newmat;
1035   Mat_SeqAIJKokkos          *akok;
1036   Mat_SeqAIJ                *aseq;
1037 
1038   PetscFunctionBegin;
1039   ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr);
1040   ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr);
1041   ierr = MatSetType(newmat,MATSEQAIJ);CHKERRQ(ierr);
1042   ierr = MatSetPreallocationCOO_SeqAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
1043   ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
1044   ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
1045   ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
1046   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
1047   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1048   akok->SetUpCOO(aseq);
1049   PetscFunctionReturn(0);
1050 }
1051 
1052 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1053 {
1054   PetscErrorCode              ierr;
1055   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1056   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1057   PetscCount                  Annz = aseq->nz;
1058   const PetscCountKokkosView& jmap = akok->jmap_d;
1059   const PetscCountKokkosView& perm = akok->perm_d;
1060   MatScalarKokkosView         Aa;
1061   ConstMatScalarKokkosView    kv;
1062   PetscMemType                memtype;
1063 
1064   PetscFunctionBegin;
1065   PetscAssert(A->assembled,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Expected matrix to be already assembled in MatSetPreallocationCOO()");
1066   ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
1067   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1068     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
1069   } else {
1070     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
1071   }
1072 
1073   if (imode == INSERT_VALUES) {ierr = MatSeqAIJGetKokkosViewWrite(A,&Aa);CHKERRQ(ierr);} /* write matrix values */
1074   else {ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr);} /* read & write matrix values */
1075 
1076   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {
1077     PetscScalar sum = 0.0;
1078     for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k));
1079     Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum;
1080   });
1081 
1082   if (imode == INSERT_VALUES) {ierr = MatSeqAIJRestoreKokkosViewWrite(A,&Aa);CHKERRQ(ierr);}
1083   else {ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);}
1084   PetscFunctionReturn(0);
1085 }
1086 
1087 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1088 {
1089   PetscErrorCode ierr;
1090 
1091   PetscFunctionBegin;
1092   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
1093   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
1094   B->offloadmask = PETSC_OFFLOAD_CPU;
1095   PetscFunctionReturn(0);
1096 }
1097 
1098 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1099 {
1100   PetscErrorCode     ierr;
1101   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1102 
1103   PetscFunctionBegin;
1104   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1105   A->boundtocpu  = PETSC_FALSE;
1106 
1107   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1108   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1109   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1110   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1111   A->ops->scale                     = MatScale_SeqAIJKokkos;
1112   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1113   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1114   A->ops->mult                      = MatMult_SeqAIJKokkos;
1115   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1116   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1117   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1118   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1119   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1120   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1121   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1122   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1123   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1124   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1125   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1126   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1127   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1128   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1129 
1130   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr);
1131   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr);
1132   PetscFunctionReturn(0);
1133 }
1134 
1135 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1136 {
1137   PetscErrorCode ierr;
1138   Mat_SeqAIJ     *aseq;
1139   PetscInt       i,m,n;
1140 
1141   PetscFunctionBegin;
1142   PetscCheckFalse(A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1143 
1144   m    = akok->nrows();
1145   n    = akok->ncols();
1146   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1147   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1148 
1149   /* Set up data structures of A as a MATSEQAIJ */
1150   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1151   aseq = (Mat_SeqAIJ*)(A)->data;
1152 
1153   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1154   akok->j_dual.sync_host();
1155 
1156   aseq->i            = akok->i_host_data();
1157   aseq->j            = akok->j_host_data();
1158   aseq->a            = akok->a_host_data();
1159   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1160   aseq->singlemalloc = PETSC_FALSE;
1161   aseq->free_a       = PETSC_FALSE;
1162   aseq->free_ij      = PETSC_FALSE;
1163   aseq->nz           = akok->nnz();
1164   aseq->maxnz        = aseq->nz;
1165 
1166   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1167   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1168   for (i=0; i<m; i++) {
1169     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1170   }
1171 
1172   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1173   akok->nonzerostate = A->nonzerostate;
1174   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1175   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1176   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1177   PetscFunctionReturn(0);
1178 }
1179 
1180 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1181 
1182    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1183  */
1184 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1185 {
1186   PetscErrorCode ierr;
1187 
1188   PetscFunctionBegin;
1189   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1190   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
1191   PetscFunctionReturn(0);
1192 }
1193 
1194 /* --------------------------------------------------------------------------------*/
1195 /*@C
1196    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1197    (the default parallel PETSc format). This matrix will ultimately be handled by
1198    Kokkos for calculations. For good matrix
1199    assembly performance the user should preallocate the matrix storage by setting
1200    the parameter nz (or the array nnz).  By setting these parameters accurately,
1201    performance during matrix assembly can be increased by more than a factor of 50.
1202 
1203    Collective
1204 
1205    Input Parameters:
1206 +  comm - MPI communicator, set to PETSC_COMM_SELF
1207 .  m - number of rows
1208 .  n - number of columns
1209 .  nz - number of nonzeros per row (same for all rows)
1210 -  nnz - array containing the number of nonzeros in the various rows
1211          (possibly different for each row) or NULL
1212 
1213    Output Parameter:
1214 .  A - the matrix
1215 
1216    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1217    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1218    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1219 
1220    Notes:
1221    If nnz is given then nz is ignored
1222 
1223    The AIJ format (also called the Yale sparse matrix format or
1224    compressed row storage), is fully compatible with standard Fortran 77
1225    storage.  That is, the stored row and column indices can begin at
1226    either one (as in Fortran) or zero.  See the users' manual for details.
1227 
1228    Specify the preallocated storage with either nz or nnz (not both).
1229    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1230    allocation.  For large problems you MUST preallocate memory or you
1231    will get TERRIBLE performance, see the users' manual chapter on matrices.
1232 
1233    By default, this format uses inodes (identical nodes) when possible, to
1234    improve numerical efficiency of matrix-vector products and solves. We
1235    search for consecutive rows with the same nonzero structure, thereby
1236    reusing matrix information to achieve increased efficiency.
1237 
1238    Level: intermediate
1239 
1240 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1241 @*/
1242 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1243 {
1244   PetscErrorCode ierr;
1245 
1246   PetscFunctionBegin;
1247   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1248   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1249   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1250   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1251   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1252   PetscFunctionReturn(0);
1253 }
1254 
1255 typedef Kokkos::TeamPolicy<>::member_type team_member;
1256 //
1257 // This factorization exploits block diagonal matrices with "Nf" (not used).
1258 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1259 //
1260 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1261 {
1262   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1263   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1264   IS                 isrow = b->row,isicol = b->icol;
1265   PetscErrorCode     ierr;
1266   const PetscInt     *r_h,*ic_h;
1267   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1268   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1269   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1270   PetscBool          row_identity,col_identity;
1271   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1272 
1273   PetscFunctionBegin;
1274   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1275   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
1276   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1277   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
1278   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
1279   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
1280   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1281   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1282   {
1283 #define KOKKOS_SHARED_LEVEL 1
1284     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1285     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1286     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1287     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1288     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1289     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1290     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1291     size_t flops_h = 0.0;
1292     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1293     Kokkos::View<size_t> d_flops_k ("flops");
1294     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1295     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1296     Kokkos::deep_copy (d_flops_k, h_flops_k);
1297     Kokkos::deep_copy (d_r_k, h_r_k);
1298     Kokkos::deep_copy (d_ic_k, h_ic_k);
1299     // Fill A --> fact
1300     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1301         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1302         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1303         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1304         // zero rows of B
1305         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1306             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1307             PetscScalar *baL = ba_d + bi_d[rowb];
1308             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1309             /* zero (unfactored row) */
1310             for (int j=0;j<nzbL;j++) baL[j] = 0;
1311             for (int j=0;j<nzbU;j++) baU[j] = 0;
1312           });
1313         // copy A into B
1314         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1315             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1316             const PetscScalar *av    = aa_d + ai_d[rowa];
1317             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1318             /* load in initial (unfactored row) */
1319             for (int j=0;j<nza;j++) {
1320               PetscInt    colb = ic[ajtmp[j]];
1321               PetscScalar vala = av[j];
1322               if (colb == rowb) {
1323                 *(ba_d + bdiag_d[rowb]) = vala;
1324               } else {
1325                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1326                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1327                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1328                 for (int j=0; j<nz ; j++) {
1329                   if (pbj[j] == colb) {
1330                     pba[j] = vala;
1331                     set++;
1332                     break;
1333                   }
1334                 }
1335                #if !defined(PETSC_HAVE_SYCL)
1336                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1337                #endif
1338               }
1339             }
1340           });
1341       });
1342     Kokkos::fence();
1343 
1344     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1345         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1346         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1347         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1348         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1349         const PetscInt  start = field*nloc, end = start + nloc;
1350         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1351         // A22 panel update for each row A(1,:) and col A(:,1)
1352         for (int ii=start; ii<end-1; ii++) {
1353           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1354           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1355           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1356           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1357           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1358               PetscInt kIdx = j*Ni + field_block_idx;
1359               if (kIdx >= nzUi) /* void */ ;
1360               else {
1361                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1362                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1363                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1364                 size_t         st_idx;
1365                 // find and do L(k,i) = A(:k,i) / A(i,i)
1366                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1367                 // get column, there has got to be a better way
1368                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1369                     if (pjL[j] == ii) {
1370                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1371                       idx = j; // output
1372                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1373                     }
1374                 }, st_idx);
1375                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1376 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1377                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1378 #endif
1379                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1380                 // U(i+1,:end)
1381                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1382                       PetscScalar Uij = baUi[uiIdx];
1383                       PetscInt    col = bjUi[uiIdx];
1384                       if (col==myk) {
1385                         // A_kk = A_kk - L_ki * U_ij(k)
1386                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1387                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1388                       } else {
1389                         PetscScalar    *start, *end, *pAkjv=NULL;
1390                         PetscInt       high, low;
1391                         const PetscInt *startj;
1392                         if (col<myk) { // L
1393                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1394                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1395                           start = pLki+1; // start at pLki+1, A22(myk,1)
1396                           startj= bj_d + bi_d[myk] + idx;
1397                           end   = ba_d + bi_d[myk+1];
1398                         } else {
1399                           PetscInt idx = bdiag_d[myk+1]+1;
1400                           start = ba_d + idx;
1401                           startj= bj_d + idx;
1402                           end   = ba_d + bdiag_d[myk];
1403                         }
1404                         // search for 'col', use bisection search - TODO
1405                         low  = 0;
1406                         high = (PetscInt)(end-start);
1407                         while (high-low > 5) {
1408                           int t = (low+high)/2;
1409                           if (startj[t] > col) high = t;
1410                           else                 low  = t;
1411                         }
1412                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1413                           if (startj[pAkjv-start] == col) break;
1414                         }
1415 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1416                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1417 #endif
1418                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1419                       }
1420                     });
1421               }
1422             });
1423           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1424           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1425         } /* endof for (i=0; i<n; i++) { */
1426         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1427       });
1428     Kokkos::fence();
1429     Kokkos::deep_copy (h_flops_k, d_flops_k);
1430     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1431     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1432         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1433         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1434         /* Invert diagonal for simpler triangular solves */
1435         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1436             int i = start + outer_index*Ni + lg_rank%Ni;
1437             if (i < end) {
1438               PetscScalar *pv = ba_d + bdiag_d[i];
1439               *pv = 1.0/(*pv);
1440             }
1441           });
1442       });
1443   }
1444   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1445   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
1446   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
1447 
1448   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1449   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
1450   if (b->inode.size) {
1451     B->ops->solve = MatSolve_SeqAIJ_Inode;
1452   } else if (row_identity && col_identity) {
1453     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1454   } else {
1455     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1456   }
1457   B->offloadmask = PETSC_OFFLOAD_GPU;
1458   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
1459   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1460   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1461   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1462   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1463   B->assembled              = PETSC_TRUE;
1464   B->preallocated           = PETSC_TRUE;
1465 
1466   PetscFunctionReturn(0);
1467 }
1468 
1469 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1470 {
1471   PetscErrorCode   ierr;
1472 
1473   PetscFunctionBegin;
1474   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1475   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1476   PetscFunctionReturn(0);
1477 }
1478 
1479 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1480 {
1481   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1482 
1483   PetscFunctionBegin;
1484   if (!factors->sptrsv_symbolic_completed) {
1485     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1486     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1487     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1488   }
1489   PetscFunctionReturn(0);
1490 }
1491 
1492 /* Check if we need to update factors etc for transpose solve */
1493 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1494 {
1495   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1496   MatColIdxType             n = A->rmap->n;
1497 
1498   PetscFunctionBegin;
1499   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1500     /* Update L^T and do sptrsv symbolic */
1501     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1502     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1503     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1504     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1505 
1506     KokkosKernels::Impl::transpose_matrix<
1507       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1508       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1509       MatRowMapKokkosView,DefaultExecutionSpace>(
1510         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1511         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1512 
1513     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1514       We have to sort the indices, until KK provides finer control options.
1515     */
1516     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1517       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1518         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1519 
1520     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1521 
1522     /* Update U^T and do sptrsv symbolic */
1523     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1524     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1525     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1526     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1527 
1528     KokkosKernels::Impl::transpose_matrix<
1529       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1530       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1531       MatRowMapKokkosView,DefaultExecutionSpace>(
1532         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1533         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1534 
1535     /* Sort indices. See comments above */
1536     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1537       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1538         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1539 
1540     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1541     factors->transpose_updated = PETSC_TRUE;
1542   }
1543   PetscFunctionReturn(0);
1544 }
1545 
1546 /* Solve Ax = b, with A = LU */
1547 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1548 {
1549   PetscErrorCode                 ierr;
1550   ConstPetscScalarKokkosView     bv;
1551   PetscScalarKokkosView          xv;
1552   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1553 
1554   PetscFunctionBegin;
1555   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1556   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
1557   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1558   ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr);
1559   /* Solve L tmpv = b */
1560   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1561   /* Solve Ux = tmpv */
1562   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1563   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1564   ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr);
1565   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1566   PetscFunctionReturn(0);
1567 }
1568 
1569 /* Solve A^T x = b, where A^T = U^T L^T */
1570 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1571 {
1572   PetscErrorCode                 ierr;
1573   ConstPetscScalarKokkosView     bv;
1574   PetscScalarKokkosView          xv;
1575   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1576 
1577   PetscFunctionBegin;
1578   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1579   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
1580   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1581   ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr);
1582   /* Solve U^T tmpv = b */
1583   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1584 
1585   /* Solve L^T x = tmpv */
1586   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1587   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1588   ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr);
1589   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1590   PetscFunctionReturn(0);
1591 }
1592 
1593 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1594 {
1595   PetscErrorCode                 ierr;
1596   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1597   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1598   PetscInt                       fill_lev = info->levels;
1599 
1600   PetscFunctionBegin;
1601   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1602   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1603 
1604   auto a_d = aijkok->a_dual.view_device();
1605   auto i_d = aijkok->i_dual.view_device();
1606   auto j_d = aijkok->j_dual.view_device();
1607 
1608   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1609 
1610   B->assembled                       = PETSC_TRUE;
1611   B->preallocated                    = PETSC_TRUE;
1612   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1613   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1614   B->ops->matsolve                   = NULL;
1615   B->ops->matsolvetranspose          = NULL;
1616   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1617 
1618   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1619   factors->transpose_updated         = PETSC_FALSE;
1620   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1621   /* TODO: log flops, but how to know that? */
1622   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1623   PetscFunctionReturn(0);
1624 }
1625 
1626 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1627 {
1628   PetscErrorCode                 ierr;
1629   Mat_SeqAIJKokkos               *aijkok;
1630   Mat_SeqAIJ                     *b;
1631   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1632   PetscInt                       fill_lev = info->levels;
1633   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1634   PetscInt                       n = A->rmap->n;
1635 
1636   PetscFunctionBegin;
1637   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1638   /* Rebuild factors */
1639   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1640   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1641 
1642   /* Create a spiluk handle and then do symbolic factorization */
1643   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1644   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1645 
1646   auto spiluk_handle = factors->kh.get_spiluk_handle();
1647 
1648   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1649   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1650   Kokkos::realloc(factors->iU_d,n+1);
1651   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1652 
1653   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1654   auto i_d = aijkok->i_dual.view_device();
1655   auto j_d = aijkok->j_dual.view_device();
1656   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1657   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1658 
1659   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1660   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1661   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1662   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1663 
1664   /* TODO: add options to select sptrsv algorithms */
1665   /* Create sptrsv handles for L, U and their transpose */
1666  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1667   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1668  #else
1669   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1670  #endif
1671 
1672   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1673   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1674   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1675   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1676 
1677   /* Fill fields of the factor matrix B */
1678   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1679   b     = (Mat_SeqAIJ*)B->data;
1680   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1681   B->info.fill_ratio_given  = info->fill;
1682   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1683 
1684   B->offloadmask            = PETSC_OFFLOAD_GPU;
1685   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1686   PetscFunctionReturn(0);
1687 }
1688 
1689 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1690 {
1691   PetscErrorCode   ierr;
1692   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1693   const PetscInt   nrows   = A->rmap->n;
1694 
1695   PetscFunctionBegin;
1696   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1697   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1698   // move B data into Kokkos
1699   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
1700   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
1701   {
1702     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1703     if (!baijkok->diag_d.extent(0)) {
1704       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1705       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1706       Kokkos::deep_copy (baijkok->diag_d, h_diag);
1707     }
1708   }
1709   PetscFunctionReturn(0);
1710 }
1711 
1712 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1713 {
1714   PetscFunctionBegin;
1715   *type = MATSOLVERKOKKOS;
1716   PetscFunctionReturn(0);
1717 }
1718 
1719 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1720 {
1721   PetscFunctionBegin;
1722   *type = MATSOLVERKOKKOSDEVICE;
1723   PetscFunctionReturn(0);
1724 }
1725 
1726 /*MC
1727   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1728   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1729 
1730   Level: beginner
1731 
1732 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1733 M*/
1734 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1735 {
1736   PetscErrorCode ierr;
1737   PetscInt       n = A->rmap->n;
1738 
1739   PetscFunctionBegin;
1740   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1741   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1742   (*B)->factortype = ftype;
1743   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1744   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1745 
1746   if (ftype == MAT_FACTOR_LU) {
1747     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1748     (*B)->canuseordering         = PETSC_TRUE;
1749     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1750   } else if (ftype == MAT_FACTOR_ILU) {
1751     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1752     (*B)->canuseordering         = PETSC_FALSE;
1753     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1754   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1755 
1756   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1757   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1758   PetscFunctionReturn(0);
1759 }
1760 
1761 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1762 {
1763   PetscErrorCode ierr;
1764   PetscInt       n = A->rmap->n;
1765 
1766   PetscFunctionBegin;
1767   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1768   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1769   (*B)->factortype = ftype;
1770   (*B)->canuseordering = PETSC_TRUE;
1771   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1772   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1773 
1774   if (ftype == MAT_FACTOR_LU) {
1775     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1776     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1777   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1778 
1779   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1780   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1781   PetscFunctionReturn(0);
1782 }
1783 
1784 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1785 {
1786   PetscErrorCode ierr;
1787 
1788   PetscFunctionBegin;
1789   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1790   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1791   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1792   PetscFunctionReturn(0);
1793 }
1794 
1795 /* Utility to print out a KokkosCsrMatrix for debugging */
1796 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1797 {
1798   PetscErrorCode    ierr;
1799   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1800   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1801   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1802   const PetscInt    *i = iv.data();
1803   const PetscInt    *j = jv.data();
1804   const PetscScalar *a = av.data();
1805   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1806 
1807   PetscFunctionBegin;
1808   ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1809   for (PetscInt k=0; k<m; k++) {
1810     ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr);
1811     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1812       ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1813     }
1814     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1815   }
1816   PetscFunctionReturn(0);
1817 }
1818