xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision b45e3bf4ff73d80a20c3202b6cd9f79d2f2d3efe)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosKernels_Sorting.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 
18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   PetscErrorCode    ierr;
29   Mat_SeqAIJ        *aijseq;
30   Mat_SeqAIJKokkos  *aijkok;
31 
32   PetscFunctionBegin;
33   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
34   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
35 
36   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38 
39   /* If aijkok does not exist, we just copy i, j to device.
40      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
41      In both cases, we build a new aijkok structure.
42   */
43   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
44     delete aijkok;
45     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
46     A->spptr = aijkok;
47   }
48 
49   if (aijkok && aijkok->device_mat_d.data()) {
50     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
51   }
52   PetscFunctionReturn(0);
53 }
54 
55 /* Sync CSR data to device if not yet */
56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
57 {
58   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
59 
60   PetscFunctionBegin;
61   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
62   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
63   if (aijkok->a_dual.need_sync_device()) {
64     aijkok->a_dual.sync_device();
65     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
66     aijkok->hermitian_updated = PETSC_FALSE;
67   }
68   PetscFunctionReturn(0);
69 }
70 
71 /* Mark the CSR data on device as modified */
72 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
73 {
74   PetscErrorCode   ierr;
75   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
76 
77   PetscFunctionBegin;
78   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
79   aijkok->a_dual.clear_sync_state();
80   aijkok->a_dual.modify_device();
81   aijkok->transpose_updated = PETSC_FALSE;
82   aijkok->hermitian_updated = PETSC_FALSE;
83   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
84   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
85   PetscFunctionReturn(0);
86 }
87 
88 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
89 {
90   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
91 
92   PetscFunctionBegin;
93   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
94   /* We do not expect one needs factors on host  */
95   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
96   PetscCheckFalse(!aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
97   aijkok->a_dual.sync_host();
98   PetscFunctionReturn(0);
99 }
100 
101 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
102 {
103   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
104 
105   PetscFunctionBegin;
106   if (aijkok) {
107     aijkok->a_dual.sync_host();
108     *array = aijkok->a_dual.view_host().data();
109   } else { /* Happens when calling MatSetValues on a newly created matrix */
110     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
111   }
112   PetscFunctionReturn(0);
113 }
114 
115 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
116 {
117   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
118 
119   PetscFunctionBegin;
120   if (aijkok) aijkok->a_dual.modify_host();
121   PetscFunctionReturn(0);
122 }
123 
124 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
125 {
126   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
127 
128   PetscFunctionBegin;
129   if (aijkok) {
130     aijkok->a_dual.sync_host();
131     *array = aijkok->a_dual.view_host().data();
132   } else {
133     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
134   }
135   PetscFunctionReturn(0);
136 }
137 
138 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
139 {
140   PetscFunctionBegin;
141   *array = NULL;
142   PetscFunctionReturn(0);
143 }
144 
145 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
146 {
147   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
148 
149   PetscFunctionBegin;
150   if (aijkok) {
151     *array = aijkok->a_dual.view_host().data();
152   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
153     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
154   }
155   PetscFunctionReturn(0);
156 }
157 
158 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
159 {
160   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
161 
162   PetscFunctionBegin;
163   if (aijkok) {
164     aijkok->a_dual.clear_sync_state();
165     aijkok->a_dual.modify_host();
166   }
167   PetscFunctionReturn(0);
168 }
169 
170 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
171 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
172 {
173   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
174   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
175 
176   PetscFunctionBegin;
177   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
178   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
179   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
180   PetscFunctionReturn(0);
181 }
182 
183 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
184 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
185 {
186   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
187 
188   PetscFunctionBegin;
189   if (aijkok && aijkok->device_mat_d.data()) {
190     *d_mat = aijkok->device_mat_d.data();
191   } else {
192     PetscErrorCode   ierr;
193     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
194     *d_mat  = NULL;
195   }
196   PetscFunctionReturn(0);
197 }
198 
199 /* Generate the transpose on device and cache it internally */
200 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
201 {
202   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
203 
204   PetscFunctionBegin;
205   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
206   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
207     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
208     CHKERRCXX(aijkok->a_dual.sync_device());
209     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
210     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
211     aijkok->transpose_updated = PETSC_TRUE;
212   }
213   *csrmatT = &aijkok->csrmatT;
214   PetscFunctionReturn(0);
215 }
216 
217 /* Generate the Hermitian on device and cache it internally */
218 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
219 {
220   PetscErrorCode                   ierr;
221   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
222 
223   PetscFunctionBegin;
224   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
225   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
226   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
227     CHKERRCXX(aijkok->a_dual.sync_device());
228     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
229     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
230    #if defined(PETSC_USE_COMPLEX)
231     const auto& a = aijkok->csrmatH.values;
232     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
233    #endif
234     aijkok->hermitian_updated = PETSC_TRUE;
235   }
236   *csrmatH = &aijkok->csrmatH;
237   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
238   PetscFunctionReturn(0);
239 }
240 
241 /* y = A x */
242 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
243 {
244   PetscErrorCode                   ierr;
245   Mat_SeqAIJKokkos                 *aijkok;
246   ConstPetscScalarKokkosView       xv;
247   PetscScalarKokkosView            yv;
248 
249   PetscFunctionBegin;
250   ierr   = PetscLogGpuTimeBegin();CHKERRQ(ierr);
251   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
252   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
253   ierr   = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
254   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
255   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
256   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
257   ierr   = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
258   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
259   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
260   ierr   = PetscLogGpuTimeEnd();CHKERRQ(ierr);
261   PetscFunctionReturn(0);
262 }
263 
264 /* y = A^T x */
265 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
266 {
267   PetscErrorCode                   ierr;
268   Mat_SeqAIJKokkos                 *aijkok;
269   const char                       *mode;
270   ConstPetscScalarKokkosView       xv;
271   PetscScalarKokkosView            yv;
272   KokkosCsrMatrix                  *csrmat;
273 
274   PetscFunctionBegin;
275   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
276   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
277   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
278   ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
279   if (A->form_explicit_transpose) {
280     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
281     mode = "N";
282   } else {
283     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
284     csrmat = &aijkok->csrmat;
285     mode = "T";
286   }
287   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
288   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
289   ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
290   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
291   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
292   PetscFunctionReturn(0);
293 }
294 
295 /* y = A^H x */
296 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
297 {
298   PetscErrorCode                   ierr;
299   Mat_SeqAIJKokkos                 *aijkok;
300   const char                       *mode;
301   ConstPetscScalarKokkosView       xv;
302   PetscScalarKokkosView            yv;
303   KokkosCsrMatrix                  *csrmat;
304 
305   PetscFunctionBegin;
306   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
307   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
308   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
309   ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
310   if (A->form_explicit_transpose) {
311     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
312     mode = "N";
313   } else {
314     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
315     csrmat = &aijkok->csrmat;
316     mode = "C";
317   }
318   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
319   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
320   ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
321   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
322   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
323   PetscFunctionReturn(0);
324 }
325 
326 /* z = A x + y */
327 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
328 {
329   PetscErrorCode                   ierr;
330   Mat_SeqAIJKokkos                 *aijkok;
331   ConstPetscScalarKokkosView       xv,yv;
332   PetscScalarKokkosView            zv;
333 
334   PetscFunctionBegin;
335   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
336   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
337   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
338   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
339   ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
340   if (zz != yy) Kokkos::deep_copy(zv,yv);
341   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
342   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
343   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
344   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
345   ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
346   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
347   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
348   PetscFunctionReturn(0);
349 }
350 
351 /* z = A^T x + y */
352 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
353 {
354   PetscErrorCode                   ierr;
355   Mat_SeqAIJKokkos                 *aijkok;
356   const char                       *mode;
357   ConstPetscScalarKokkosView       xv,yv;
358   PetscScalarKokkosView            zv;
359   KokkosCsrMatrix                  *csrmat;
360 
361   PetscFunctionBegin;
362   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
363   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
364   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
365   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
366   ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
367   if (zz != yy) Kokkos::deep_copy(zv,yv);
368   if (A->form_explicit_transpose) {
369     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
370     mode = "N";
371   } else {
372     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
373     csrmat = &aijkok->csrmat;
374     mode = "T";
375   }
376   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
377   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
378   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
379   ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
380   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
381   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
382   PetscFunctionReturn(0);
383 }
384 
385 /* z = A^H x + y */
386 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
387 {
388   PetscErrorCode                   ierr;
389   Mat_SeqAIJKokkos                 *aijkok;
390   const char                       *mode;
391   ConstPetscScalarKokkosView       xv,yv;
392   PetscScalarKokkosView            zv;
393   KokkosCsrMatrix                  *csrmat;
394 
395   PetscFunctionBegin;
396   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
397   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
398   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
399   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
400   ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
401   if (zz != yy) Kokkos::deep_copy(zv,yv);
402   if (A->form_explicit_transpose) {
403     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
404     mode = "N";
405   } else {
406     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
407     csrmat = &aijkok->csrmat;
408     mode = "C";
409   }
410   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
411   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
412   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
413   ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
414   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
415   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
416   PetscFunctionReturn(0);
417 }
418 
419 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
420 {
421   PetscErrorCode            ierr;
422   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
423 
424   PetscFunctionBegin;
425   switch (op) {
426     case MAT_FORM_EXPLICIT_TRANSPOSE:
427       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
428       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
429       A->form_explicit_transpose = flg;
430       break;
431     default:
432       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
433       break;
434   }
435   PetscFunctionReturn(0);
436 }
437 
438 /* Depending on reuse, either build a new mat, or use the existing mat */
439 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
440 {
441   PetscErrorCode   ierr;
442   Mat_SeqAIJ       *aseq;
443 
444   PetscFunctionBegin;
445   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
446   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
447     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
448   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
449     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
450   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
451     PetscCheckFalse(A != *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
452     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
453     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
454     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
455     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
456     aseq = static_cast<Mat_SeqAIJ*>(A->data);
457     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
458       PetscCheckFalse(A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
459       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
460     }
461   }
462   PetscFunctionReturn(0);
463 }
464 
465 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
466    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
467  */
468 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
469 {
470   PetscErrorCode        ierr;
471   Mat_SeqAIJ            *bseq;
472   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
473   Mat                   mat;
474 
475   PetscFunctionBegin;
476   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
477   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
478   mat  = *B;
479   if (A->assembled) {
480     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
481     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
482     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
483     /* Now copy values to B if needed */
484     if (dupOption == MAT_COPY_VALUES) {
485       if (akok->a_dual.need_sync_device()) {
486         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
487         bkok->a_dual.modify_host();
488       } else { /* If device has the latest data, we only copy data on device */
489         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
490         bkok->a_dual.modify_device();
491       }
492     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
493       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
494       bkok->a_dual.modify_host();
495     }
496     mat->spptr = bkok;
497   }
498 
499   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
500   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
501   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
502   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
503   PetscFunctionReturn(0);
504 }
505 
506 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
507 {
508   PetscErrorCode    ierr;
509   Mat               At;
510   KokkosCsrMatrix   *internT;
511   Mat_SeqAIJKokkos  *atkok,*bkok;
512 
513   PetscFunctionBegin;
514   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
515   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
516     /* Deep copy internT, as we want to isolate the internal transpose */
517     CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
518     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
519     if (reuse == MAT_INITIAL_MATRIX) *B = At;
520     else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
521   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
522     if ((*B)->assembled) {
523       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
524       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
525       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
526     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
527       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
528       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
529       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
530       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
531       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
532     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
533   }
534   PetscFunctionReturn(0);
535 }
536 
537 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
538 {
539   PetscErrorCode             ierr;
540   Mat_SeqAIJKokkos           *aijkok;
541 
542   PetscFunctionBegin;
543   if (A->factortype == MAT_FACTOR_NONE) {
544     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
545     delete aijkok;
546   } else {
547     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
548   }
549   A->spptr = NULL;
550   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
551   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
552   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
553   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
554   PetscFunctionReturn(0);
555 }
556 
557 /*MC
558    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
559 
560    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
561 
562    Options Database Keys:
563 .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
564 
565   Level: beginner
566 
567 .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS
568 M*/
569 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
570 {
571   PetscErrorCode ierr;
572 
573   PetscFunctionBegin;
574   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
575   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
576   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
577   PetscFunctionReturn(0);
578 }
579 
580 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
581 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
582 {
583   PetscErrorCode               ierr;
584   Mat_SeqAIJ                   *a,*b;
585   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
586   MatScalarKokkosView          aa,ba,ca;
587   MatRowMapKokkosView          ai,bi,ci;
588   MatColIdxKokkosView          aj,bj,cj;
589   PetscInt                     m,n,nnz,aN;
590 
591   PetscFunctionBegin;
592   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
593   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
594   PetscValidPointer(C,4);
595   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
596   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
597   PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
598   PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
599 
600   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
601   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
602   a    = static_cast<Mat_SeqAIJ*>(A->data);
603   b    = static_cast<Mat_SeqAIJ*>(B->data);
604   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
605   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
606   aa   = akok->a_dual.view_device();
607   ai   = akok->i_dual.view_device();
608   ba   = bkok->a_dual.view_device();
609   bi   = bkok->i_dual.view_device();
610   m    = A->rmap->n; /* M, N and nnz of C */
611   n    = A->cmap->n + B->cmap->n;
612   nnz  = a->nz + b->nz;
613   aN   = A->cmap->n; /* N of A */
614   if (reuse == MAT_INITIAL_MATRIX) {
615     aj = akok->j_dual.view_device();
616     bj = bkok->j_dual.view_device();
617     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
618     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
619     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
620     ca = ca_dual.view_device();
621     ci = ci_dual.view_device();
622     cj = cj_dual.view_device();
623 
624     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
625     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
626       PetscInt i = t.league_rank(); /* row i */
627       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
628 
629       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
630         ci(i) = coffset;
631         if (i == m-1) ci(m) = ai(m) + bi(m);
632       });
633 
634       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
635         if (k < alen) {
636           ca(coffset+k) = aa(ai(i)+k);
637           cj(coffset+k) = aj(ai(i)+k);
638         } else {
639           ca(coffset+k) = ba(bi(i)+k-alen);
640           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
641         }
642       });
643     });
644     ca_dual.modify_device();
645     ci_dual.modify_device();
646     cj_dual.modify_device();
647     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
648     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
649   } else if (reuse == MAT_REUSE_MATRIX) {
650     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
651     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
652     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
653     ca   = ckok->a_dual.view_device();
654     ci   = ckok->i_dual.view_device();
655 
656     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
657       PetscInt i = t.league_rank(); /* row i */
658       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
659       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
660         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
661         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
662       });
663     });
664     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
665   }
666   PetscFunctionReturn(0);
667 }
668 
669 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
670 {
671   PetscFunctionBegin;
672   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
673   PetscFunctionReturn(0);
674 }
675 
676 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
677 {
678   PetscErrorCode                 ierr;
679   Mat_Product                    *product = C->product;
680   Mat                            A,B;
681   bool                           transA,transB; /* use bool, since KK needs this type */
682   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
683   Mat_SeqAIJ                     *c;
684   MatProductData_SeqAIJKokkos    *pdata;
685   KokkosCsrMatrix                *csrmatA,*csrmatB;
686 
687   PetscFunctionBegin;
688   MatCheckProduct(C,1);
689   PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
690   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
691 
692   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
693     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
694     PetscFunctionReturn(0);
695   }
696 
697   switch (product->type) {
698     case MATPRODUCT_AB:  transA = false; transB = false; break;
699     case MATPRODUCT_AtB: transA = true;  transB = false; break;
700     case MATPRODUCT_ABt: transA = false; transB = true;  break;
701     default:
702       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
703   }
704 
705   A     = product->A;
706   B     = product->B;
707   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
708   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
709   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
710   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
711   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
712 
713   PetscCheckFalse(!ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
714 
715   csrmatA = &akok->csrmat;
716   csrmatB = &bkok->csrmat;
717 
718   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
719   if (transA) {
720     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
721     transA = false;
722   }
723 
724   if (transB) {
725     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
726     transB = false;
727   }
728   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
729   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
730   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
731   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
732   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
733   /* shorter version of MatAssemblyEnd_SeqAIJ */
734   c = (Mat_SeqAIJ*)C->data;
735   ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
736   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
737   ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
738   c->reallocs         = 0;
739   C->info.mallocs     = 0;
740   C->info.nz_unneeded = 0;
741   C->assembled        = C->was_assembled = PETSC_TRUE;
742   C->num_ass++;
743   PetscFunctionReturn(0);
744 }
745 
746 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
747 {
748   PetscErrorCode                 ierr;
749   Mat_Product                    *product = C->product;
750   MatProductType                 ptype;
751   Mat                            A,B;
752   bool                           transA,transB;
753   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
754   MatProductData_SeqAIJKokkos    *pdata;
755   MPI_Comm                       comm;
756   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
757 
758   PetscFunctionBegin;
759   MatCheckProduct(C,1);
760   ierr = PetscObjectGetComm((PetscObject)C,&comm);
761   PetscCheckFalse(product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
762   A       = product->A;
763   B       = product->B;
764   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
765   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
766   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
767   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
768   csrmatA = &akok->csrmat;
769   csrmatB = &bkok->csrmat;
770 
771   ptype   = product->type;
772   switch (ptype) {
773     case MATPRODUCT_AB:  transA = false; transB = false; break;
774     case MATPRODUCT_AtB: transA = true;  transB = false; break;
775     case MATPRODUCT_ABt: transA = false; transB = true;  break;
776     default:
777       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
778   }
779 
780   product->data = pdata = new MatProductData_SeqAIJKokkos();
781   pdata->kh.set_team_work_size(16);
782   pdata->kh.set_dynamic_scheduling(true);
783   pdata->reusesym = product->api_user;
784 
785   /* TODO: add command line options to select spgemm algorithms */
786   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
787   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
788      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
789    */
790   pdata->kh.create_spgemm_handle(spgemm_alg);
791 
792   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
793   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
794   if (transA) {
795     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
796     transA = false;
797   }
798 
799   if (transB) {
800     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
801     transB = false;
802   }
803 
804   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
805   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
806     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
807     calling new Mat_SeqAIJKokkos().
808     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
809   */
810   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
811   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
812   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
813 
814   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
815   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
816   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
817   PetscFunctionReturn(0);
818 }
819 
820 /* handles sparse matrix matrix ops */
821 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
822 {
823   PetscErrorCode ierr;
824   Mat_Product    *product = mat->product;
825   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
826 
827   PetscFunctionBegin;
828   MatCheckProduct(mat,1);
829   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
830   if (product->type == MATPRODUCT_ABC) {
831     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
832   }
833   if (Biskok && Ciskok) {
834     switch (product->type) {
835     case MATPRODUCT_AB:
836     case MATPRODUCT_AtB:
837     case MATPRODUCT_ABt:
838       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
839       break;
840     case MATPRODUCT_PtAP:
841     case MATPRODUCT_RARt:
842     case MATPRODUCT_ABC:
843       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
844       break;
845     default:
846       break;
847     }
848   } else { /* fallback for AIJ */
849     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
850   }
851   PetscFunctionReturn(0);
852 }
853 
854 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
855 {
856   PetscErrorCode   ierr;
857   Mat_SeqAIJKokkos *aijkok;
858 
859   PetscFunctionBegin;
860   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
861   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
862   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
863   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
864   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
865   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
866   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
867   PetscFunctionReturn(0);
868 }
869 
870 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
871 {
872   PetscErrorCode   ierr;
873   Mat_SeqAIJKokkos *aijkok;
874 
875   PetscFunctionBegin;
876   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
877   if (aijkok) { /* Only zero the device if data is already there */
878     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
879     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
880   } else { /* Might be preallocated but not assembled */
881     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
882   }
883   PetscFunctionReturn(0);
884 }
885 
886 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
887 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
888 {
889   PetscErrorCode     ierr;
890   Mat_SeqAIJKokkos   *aijkok;
891 
892   PetscFunctionBegin;
893   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
894   PetscValidPointer(kv,2);
895   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
896   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
897   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
898   *kv    = aijkok->a_dual.view_device();
899   PetscFunctionReturn(0);
900 }
901 
902 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
903 {
904   PetscFunctionBegin;
905   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
906   PetscValidPointer(kv,2);
907   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
908   PetscFunctionReturn(0);
909 }
910 
911 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
912 {
913   PetscErrorCode     ierr;
914   Mat_SeqAIJKokkos   *aijkok;
915 
916   PetscFunctionBegin;
917   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
918   PetscValidPointer(kv,2);
919   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
920   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
921   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
922   *kv    = aijkok->a_dual.view_device();
923   PetscFunctionReturn(0);
924 }
925 
926 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
927 {
928   PetscErrorCode     ierr;
929 
930   PetscFunctionBegin;
931   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
932   PetscValidPointer(kv,2);
933   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
934   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
935   PetscFunctionReturn(0);
936 }
937 
938 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
939 {
940   Mat_SeqAIJKokkos   *aijkok;
941 
942   PetscFunctionBegin;
943   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
944   PetscValidPointer(kv,2);
945   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
946   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
947   *kv    = aijkok->a_dual.view_device();
948   PetscFunctionReturn(0);
949 }
950 
951 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
952 {
953   PetscErrorCode     ierr;
954 
955   PetscFunctionBegin;
956   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
957   PetscValidPointer(kv,2);
958   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
959   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
960   PetscFunctionReturn(0);
961 }
962 
963 /* Computes Y += alpha X */
964 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
965 {
966   PetscErrorCode             ierr;
967   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
968   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
969   ConstMatScalarKokkosView   Xa;
970   MatScalarKokkosView        Ya;
971 
972   PetscFunctionBegin;
973   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
974   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
975   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
976   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
977   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
978 
979   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
980     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
981     PetscBool e;
982     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
983     if (e) {
984       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
985       if (e) pattern = SAME_NONZERO_PATTERN;
986     }
987   }
988 
989   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
990     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
991     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
992     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
993   */
994   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
995   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
996   Xa   = xkok->a_dual.view_device();
997   Ya   = ykok->a_dual.view_device();
998 
999   if (pattern == SAME_NONZERO_PATTERN) {
1000     KokkosBlas::axpy(alpha,Xa,Ya);
1001     ierr = MatSeqAIJKokkosModifyDevice(Y);
1002   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1003     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
1004     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
1005 
1006     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
1007       PetscInt i = t.league_rank(); /* row i */
1008       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
1009         PetscInt p,q = Yi(i);
1010         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1011           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1012           if (Xj(p) == Yj(q)) { /* Found it */
1013             Ya(q) += alpha * Xa(p);
1014             q++;
1015           } else {
1016             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1017                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1018             */
1019             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1020           }
1021         }
1022       });
1023     });
1024     ierr = MatSeqAIJKokkosModifyDevice(Y);
1025   } else { /* different nonzero patterns */
1026     Mat             Z;
1027     KokkosCsrMatrix zcsr;
1028     KernelHandle    kh;
1029     kh.create_spadd_handle(false);
1030     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1031     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1032     zkok = new Mat_SeqAIJKokkos(zcsr);
1033     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1034     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1035     kh.destroy_spadd_handle();
1036   }
1037   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1038   ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */
1039   PetscFunctionReturn(0);
1040 }
1041 
1042 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
1043 {
1044   PetscErrorCode            ierr;
1045   Mat_SeqAIJKokkos          *akok;
1046   Mat_SeqAIJ                *aseq;
1047 
1048   PetscFunctionBegin;
1049   ierr = MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
1050   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
1051   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1052   delete akok;
1053   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE);
1054   ierr = MatZeroEntries_SeqAIJKokkos(mat);CHKERRQ(ierr);
1055   akok->SetUpCOO(aseq);
1056   PetscFunctionReturn(0);
1057 }
1058 
1059 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1060 {
1061   PetscErrorCode              ierr;
1062   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1063   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1064   PetscCount                  Annz = aseq->nz;
1065   const PetscCountKokkosView& jmap = akok->jmap_d;
1066   const PetscCountKokkosView& perm = akok->perm_d;
1067   MatScalarKokkosView         Aa;
1068   ConstMatScalarKokkosView    kv;
1069   PetscMemType                memtype;
1070 
1071   PetscFunctionBegin;
1072   ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
1073   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1074     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
1075   } else {
1076     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
1077   }
1078 
1079   if (imode == INSERT_VALUES) {
1080     ierr = MatSeqAIJGetKokkosViewWrite(A,&Aa);CHKERRQ(ierr); /* write matrix values */
1081     Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
1082   } else {ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr);} /* read & write matrix values */
1083 
1084   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));});
1085 
1086   if (imode == INSERT_VALUES) {ierr = MatSeqAIJRestoreKokkosViewWrite(A,&Aa);CHKERRQ(ierr);}
1087   else {ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);}
1088   PetscFunctionReturn(0);
1089 }
1090 
1091 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1092 {
1093   PetscErrorCode ierr;
1094 
1095   PetscFunctionBegin;
1096   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
1097   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
1098   B->offloadmask = PETSC_OFFLOAD_CPU;
1099   PetscFunctionReturn(0);
1100 }
1101 
1102 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1103 {
1104   PetscErrorCode     ierr;
1105   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1106 
1107   PetscFunctionBegin;
1108   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1109   A->boundtocpu  = PETSC_FALSE;
1110 
1111   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1112   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1113   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1114   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1115   A->ops->scale                     = MatScale_SeqAIJKokkos;
1116   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1117   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1118   A->ops->mult                      = MatMult_SeqAIJKokkos;
1119   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1120   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1121   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1122   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1123   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1124   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1125   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1126   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1127   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1128   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1129   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1130   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1131   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1132   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1133 
1134   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr);
1135   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr);
1136   PetscFunctionReturn(0);
1137 }
1138 
1139 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1140 {
1141   PetscErrorCode ierr;
1142   Mat_SeqAIJ     *aseq;
1143   PetscInt       i,m,n;
1144 
1145   PetscFunctionBegin;
1146   PetscCheckFalse(A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1147 
1148   m    = akok->nrows();
1149   n    = akok->ncols();
1150   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1151   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1152 
1153   /* Set up data structures of A as a MATSEQAIJ */
1154   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1155   aseq = (Mat_SeqAIJ*)(A)->data;
1156 
1157   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1158   akok->j_dual.sync_host();
1159 
1160   aseq->i            = akok->i_host_data();
1161   aseq->j            = akok->j_host_data();
1162   aseq->a            = akok->a_host_data();
1163   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1164   aseq->singlemalloc = PETSC_FALSE;
1165   aseq->free_a       = PETSC_FALSE;
1166   aseq->free_ij      = PETSC_FALSE;
1167   aseq->nz           = akok->nnz();
1168   aseq->maxnz        = aseq->nz;
1169 
1170   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1171   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1172   for (i=0; i<m; i++) {
1173     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1174   }
1175 
1176   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1177   akok->nonzerostate = A->nonzerostate;
1178   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1179   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1180   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1181   PetscFunctionReturn(0);
1182 }
1183 
1184 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1185 
1186    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1187  */
1188 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1189 {
1190   PetscErrorCode ierr;
1191 
1192   PetscFunctionBegin;
1193   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1194   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
1195   PetscFunctionReturn(0);
1196 }
1197 
1198 /* --------------------------------------------------------------------------------*/
1199 /*@C
1200    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1201    (the default parallel PETSc format). This matrix will ultimately be handled by
1202    Kokkos for calculations. For good matrix
1203    assembly performance the user should preallocate the matrix storage by setting
1204    the parameter nz (or the array nnz).  By setting these parameters accurately,
1205    performance during matrix assembly can be increased by more than a factor of 50.
1206 
1207    Collective
1208 
1209    Input Parameters:
1210 +  comm - MPI communicator, set to PETSC_COMM_SELF
1211 .  m - number of rows
1212 .  n - number of columns
1213 .  nz - number of nonzeros per row (same for all rows)
1214 -  nnz - array containing the number of nonzeros in the various rows
1215          (possibly different for each row) or NULL
1216 
1217    Output Parameter:
1218 .  A - the matrix
1219 
1220    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1221    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1222    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1223 
1224    Notes:
1225    If nnz is given then nz is ignored
1226 
1227    The AIJ format (also called the Yale sparse matrix format or
1228    compressed row storage), is fully compatible with standard Fortran 77
1229    storage.  That is, the stored row and column indices can begin at
1230    either one (as in Fortran) or zero.  See the users' manual for details.
1231 
1232    Specify the preallocated storage with either nz or nnz (not both).
1233    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1234    allocation.  For large problems you MUST preallocate memory or you
1235    will get TERRIBLE performance, see the users' manual chapter on matrices.
1236 
1237    By default, this format uses inodes (identical nodes) when possible, to
1238    improve numerical efficiency of matrix-vector products and solves. We
1239    search for consecutive rows with the same nonzero structure, thereby
1240    reusing matrix information to achieve increased efficiency.
1241 
1242    Level: intermediate
1243 
1244 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1245 @*/
1246 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1247 {
1248   PetscErrorCode ierr;
1249 
1250   PetscFunctionBegin;
1251   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1252   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1253   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1254   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1255   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1256   PetscFunctionReturn(0);
1257 }
1258 
1259 typedef Kokkos::TeamPolicy<>::member_type team_member;
1260 //
1261 // This factorization exploits block diagonal matrices with "Nf" (not used).
1262 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1263 //
1264 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1265 {
1266   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1267   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1268   IS                 isrow = b->row,isicol = b->icol;
1269   PetscErrorCode     ierr;
1270   const PetscInt     *r_h,*ic_h;
1271   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1272   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1273   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1274   PetscBool          row_identity,col_identity;
1275   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1276 
1277   PetscFunctionBegin;
1278   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1279   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
1280   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1281   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
1282   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
1283   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
1284   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1285   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1286   {
1287 #define KOKKOS_SHARED_LEVEL 1
1288     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1289     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1290     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1291     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1292     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1293     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1294     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1295     size_t flops_h = 0.0;
1296     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1297     Kokkos::View<size_t> d_flops_k ("flops");
1298     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1299     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1300     Kokkos::deep_copy (d_flops_k, h_flops_k);
1301     Kokkos::deep_copy (d_r_k, h_r_k);
1302     Kokkos::deep_copy (d_ic_k, h_ic_k);
1303     // Fill A --> fact
1304     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1305         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1306         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1307         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1308         // zero rows of B
1309         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1310             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1311             PetscScalar *baL = ba_d + bi_d[rowb];
1312             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1313             /* zero (unfactored row) */
1314             for (int j=0;j<nzbL;j++) baL[j] = 0;
1315             for (int j=0;j<nzbU;j++) baU[j] = 0;
1316           });
1317         // copy A into B
1318         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1319             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1320             const PetscScalar *av    = aa_d + ai_d[rowa];
1321             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1322             /* load in initial (unfactored row) */
1323             for (int j=0;j<nza;j++) {
1324               PetscInt    colb = ic[ajtmp[j]];
1325               PetscScalar vala = av[j];
1326               if (colb == rowb) {
1327                 *(ba_d + bdiag_d[rowb]) = vala;
1328               } else {
1329                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1330                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1331                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1332                 for (int j=0; j<nz ; j++) {
1333                   if (pbj[j] == colb) {
1334                     pba[j] = vala;
1335                     set++;
1336                     break;
1337                   }
1338                 }
1339                #if !defined(PETSC_HAVE_SYCL)
1340                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1341                #endif
1342               }
1343             }
1344           });
1345       });
1346     Kokkos::fence();
1347 
1348     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1349         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1350         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1351         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1352         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1353         const PetscInt  start = field*nloc, end = start + nloc;
1354         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1355         // A22 panel update for each row A(1,:) and col A(:,1)
1356         for (int ii=start; ii<end-1; ii++) {
1357           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1358           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1359           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1360           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1361           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1362               PetscInt kIdx = j*Ni + field_block_idx;
1363               if (kIdx >= nzUi) /* void */ ;
1364               else {
1365                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1366                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1367                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1368                 size_t         st_idx;
1369                 // find and do L(k,i) = A(:k,i) / A(i,i)
1370                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1371                 // get column, there has got to be a better way
1372                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1373                     if (pjL[j] == ii) {
1374                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1375                       idx = j; // output
1376                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1377                     }
1378                 }, st_idx);
1379                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1380 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1381                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1382 #endif
1383                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1384                 // U(i+1,:end)
1385                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1386                       PetscScalar Uij = baUi[uiIdx];
1387                       PetscInt    col = bjUi[uiIdx];
1388                       if (col==myk) {
1389                         // A_kk = A_kk - L_ki * U_ij(k)
1390                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1391                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1392                       } else {
1393                         PetscScalar    *start, *end, *pAkjv=NULL;
1394                         PetscInt       high, low;
1395                         const PetscInt *startj;
1396                         if (col<myk) { // L
1397                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1398                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1399                           start = pLki+1; // start at pLki+1, A22(myk,1)
1400                           startj= bj_d + bi_d[myk] + idx;
1401                           end   = ba_d + bi_d[myk+1];
1402                         } else {
1403                           PetscInt idx = bdiag_d[myk+1]+1;
1404                           start = ba_d + idx;
1405                           startj= bj_d + idx;
1406                           end   = ba_d + bdiag_d[myk];
1407                         }
1408                         // search for 'col', use bisection search - TODO
1409                         low  = 0;
1410                         high = (PetscInt)(end-start);
1411                         while (high-low > 5) {
1412                           int t = (low+high)/2;
1413                           if (startj[t] > col) high = t;
1414                           else                 low  = t;
1415                         }
1416                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1417                           if (startj[pAkjv-start] == col) break;
1418                         }
1419 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1420                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1421 #endif
1422                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1423                       }
1424                     });
1425               }
1426             });
1427           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1428           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1429         } /* endof for (i=0; i<n; i++) { */
1430         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1431       });
1432     Kokkos::fence();
1433     Kokkos::deep_copy (h_flops_k, d_flops_k);
1434     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1435     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1436         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1437         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1438         /* Invert diagonal for simpler triangular solves */
1439         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1440             int i = start + outer_index*Ni + lg_rank%Ni;
1441             if (i < end) {
1442               PetscScalar *pv = ba_d + bdiag_d[i];
1443               *pv = 1.0/(*pv);
1444             }
1445           });
1446       });
1447   }
1448   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1449   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
1450   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
1451 
1452   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1453   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
1454   if (b->inode.size) {
1455     B->ops->solve = MatSolve_SeqAIJ_Inode;
1456   } else if (row_identity && col_identity) {
1457     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1458   } else {
1459     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1460   }
1461   B->offloadmask = PETSC_OFFLOAD_GPU;
1462   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
1463   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1464   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1465   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1466   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1467   B->assembled              = PETSC_TRUE;
1468   B->preallocated           = PETSC_TRUE;
1469 
1470   PetscFunctionReturn(0);
1471 }
1472 
1473 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1474 {
1475   PetscErrorCode   ierr;
1476 
1477   PetscFunctionBegin;
1478   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1479   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1480   PetscFunctionReturn(0);
1481 }
1482 
1483 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1484 {
1485   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1486 
1487   PetscFunctionBegin;
1488   if (!factors->sptrsv_symbolic_completed) {
1489     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1490     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1491     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1492   }
1493   PetscFunctionReturn(0);
1494 }
1495 
1496 /* Check if we need to update factors etc for transpose solve */
1497 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1498 {
1499   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1500   MatColIdxType             n = A->rmap->n;
1501 
1502   PetscFunctionBegin;
1503   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1504     /* Update L^T and do sptrsv symbolic */
1505     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1506     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1507     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1508     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1509 
1510     KokkosKernels::Impl::transpose_matrix<
1511       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1512       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1513       MatRowMapKokkosView,DefaultExecutionSpace>(
1514         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1515         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1516 
1517     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1518       We have to sort the indices, until KK provides finer control options.
1519     */
1520     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1521       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1522         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1523 
1524     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1525 
1526     /* Update U^T and do sptrsv symbolic */
1527     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1528     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1529     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1530     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1531 
1532     KokkosKernels::Impl::transpose_matrix<
1533       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1534       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1535       MatRowMapKokkosView,DefaultExecutionSpace>(
1536         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1537         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1538 
1539     /* Sort indices. See comments above */
1540     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1541       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1542         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1543 
1544     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1545     factors->transpose_updated = PETSC_TRUE;
1546   }
1547   PetscFunctionReturn(0);
1548 }
1549 
1550 /* Solve Ax = b, with A = LU */
1551 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1552 {
1553   PetscErrorCode                 ierr;
1554   ConstPetscScalarKokkosView     bv;
1555   PetscScalarKokkosView          xv;
1556   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1557 
1558   PetscFunctionBegin;
1559   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1560   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
1561   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1562   ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr);
1563   /* Solve L tmpv = b */
1564   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1565   /* Solve Ux = tmpv */
1566   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1567   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1568   ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr);
1569   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1570   PetscFunctionReturn(0);
1571 }
1572 
1573 /* Solve A^T x = b, where A^T = U^T L^T */
1574 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1575 {
1576   PetscErrorCode                 ierr;
1577   ConstPetscScalarKokkosView     bv;
1578   PetscScalarKokkosView          xv;
1579   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1580 
1581   PetscFunctionBegin;
1582   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1583   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
1584   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1585   ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr);
1586   /* Solve U^T tmpv = b */
1587   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1588 
1589   /* Solve L^T x = tmpv */
1590   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1591   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1592   ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr);
1593   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1594   PetscFunctionReturn(0);
1595 }
1596 
1597 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1598 {
1599   PetscErrorCode                 ierr;
1600   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1601   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1602   PetscInt                       fill_lev = info->levels;
1603 
1604   PetscFunctionBegin;
1605   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1606   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1607 
1608   auto a_d = aijkok->a_dual.view_device();
1609   auto i_d = aijkok->i_dual.view_device();
1610   auto j_d = aijkok->j_dual.view_device();
1611 
1612   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1613 
1614   B->assembled                       = PETSC_TRUE;
1615   B->preallocated                    = PETSC_TRUE;
1616   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1617   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1618   B->ops->matsolve                   = NULL;
1619   B->ops->matsolvetranspose          = NULL;
1620   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1621 
1622   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1623   factors->transpose_updated         = PETSC_FALSE;
1624   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1625   /* TODO: log flops, but how to know that? */
1626   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1627   PetscFunctionReturn(0);
1628 }
1629 
1630 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1631 {
1632   PetscErrorCode                 ierr;
1633   Mat_SeqAIJKokkos               *aijkok;
1634   Mat_SeqAIJ                     *b;
1635   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1636   PetscInt                       fill_lev = info->levels;
1637   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1638   PetscInt                       n = A->rmap->n;
1639 
1640   PetscFunctionBegin;
1641   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1642   /* Rebuild factors */
1643   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1644   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1645 
1646   /* Create a spiluk handle and then do symbolic factorization */
1647   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1648   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1649 
1650   auto spiluk_handle = factors->kh.get_spiluk_handle();
1651 
1652   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1653   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1654   Kokkos::realloc(factors->iU_d,n+1);
1655   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1656 
1657   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1658   auto i_d = aijkok->i_dual.view_device();
1659   auto j_d = aijkok->j_dual.view_device();
1660   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1661   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1662 
1663   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1664   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1665   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1666   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1667 
1668   /* TODO: add options to select sptrsv algorithms */
1669   /* Create sptrsv handles for L, U and their transpose */
1670  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1671   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1672  #else
1673   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1674  #endif
1675 
1676   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1677   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1678   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1679   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1680 
1681   /* Fill fields of the factor matrix B */
1682   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1683   b     = (Mat_SeqAIJ*)B->data;
1684   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1685   B->info.fill_ratio_given  = info->fill;
1686   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1687 
1688   B->offloadmask            = PETSC_OFFLOAD_GPU;
1689   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1690   PetscFunctionReturn(0);
1691 }
1692 
1693 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1694 {
1695   PetscErrorCode   ierr;
1696   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1697   const PetscInt   nrows   = A->rmap->n;
1698 
1699   PetscFunctionBegin;
1700   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1701   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1702   // move B data into Kokkos
1703   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
1704   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
1705   {
1706     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1707     if (!baijkok->diag_d.extent(0)) {
1708       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1709       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1710       Kokkos::deep_copy (baijkok->diag_d, h_diag);
1711     }
1712   }
1713   PetscFunctionReturn(0);
1714 }
1715 
1716 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1717 {
1718   PetscFunctionBegin;
1719   *type = MATSOLVERKOKKOS;
1720   PetscFunctionReturn(0);
1721 }
1722 
1723 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1724 {
1725   PetscFunctionBegin;
1726   *type = MATSOLVERKOKKOSDEVICE;
1727   PetscFunctionReturn(0);
1728 }
1729 
1730 /*MC
1731   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1732   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1733 
1734   Level: beginner
1735 
1736 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1737 M*/
1738 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1739 {
1740   PetscErrorCode ierr;
1741   PetscInt       n = A->rmap->n;
1742 
1743   PetscFunctionBegin;
1744   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1745   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1746   (*B)->factortype = ftype;
1747   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1748   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1749 
1750   if (ftype == MAT_FACTOR_LU) {
1751     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1752     (*B)->canuseordering         = PETSC_TRUE;
1753     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1754   } else if (ftype == MAT_FACTOR_ILU) {
1755     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1756     (*B)->canuseordering         = PETSC_FALSE;
1757     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1758   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1759 
1760   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1761   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1762   PetscFunctionReturn(0);
1763 }
1764 
1765 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1766 {
1767   PetscErrorCode ierr;
1768   PetscInt       n = A->rmap->n;
1769 
1770   PetscFunctionBegin;
1771   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1772   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1773   (*B)->factortype = ftype;
1774   (*B)->canuseordering = PETSC_TRUE;
1775   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1776   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1777 
1778   if (ftype == MAT_FACTOR_LU) {
1779     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1780     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1781   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1782 
1783   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1784   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1785   PetscFunctionReturn(0);
1786 }
1787 
1788 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1789 {
1790   PetscErrorCode ierr;
1791 
1792   PetscFunctionBegin;
1793   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1794   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1795   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1796   PetscFunctionReturn(0);
1797 }
1798 
1799 /* Utility to print out a KokkosCsrMatrix for debugging */
1800 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1801 {
1802   PetscErrorCode    ierr;
1803   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1804   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1805   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1806   const PetscInt    *i = iv.data();
1807   const PetscInt    *j = jv.data();
1808   const PetscScalar *a = av.data();
1809   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1810 
1811   PetscFunctionBegin;
1812   ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1813   for (PetscInt k=0; k<m; k++) {
1814     ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr);
1815     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1816       ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1817     }
1818     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1819   }
1820   PetscFunctionReturn(0);
1821 }
1822