xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 115dd874ad11c2dfdfe8de0e926d279faa2e2e8e)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosKernels_Sorting.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 
18 #include <../src/mat/impls/aij/seq/aij.h>
19 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
20 
21 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
22 
23 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
24    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
25    In the latter case, it is important to set a_dual's sync state correctly.
26  */
27 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
28 {
29   PetscErrorCode    ierr;
30   Mat_SeqAIJ        *aijseq;
31   Mat_SeqAIJKokkos  *aijkok;
32 
33   PetscFunctionBegin;
34   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
35   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
36 
37   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
38   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
39 
40   /* If aijkok does not exist, we just copy i, j to device.
41      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
42      In both cases, we build a new aijkok structure.
43   */
44   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
45     delete aijkok;
46     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
47     A->spptr = aijkok;
48   }
49 
50   if (aijkok && aijkok->device_mat_d.data()) {
51     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
52   }
53   PetscFunctionReturn(0);
54 }
55 
56 /* Sync CSR data to device if not yet */
57 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
58 {
59   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
60 
61   PetscFunctionBegin;
62   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
63   PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
64   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
65   if (aijkok->a_dual.need_sync_device()) {
66     aijkok->a_dual.sync_device();
67     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
68     aijkok->hermitian_updated = PETSC_FALSE;
69   }
70   PetscFunctionReturn(0);
71 }
72 
73 /* Mark the CSR data on device as modified */
74 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
75 {
76   PetscErrorCode   ierr;
77   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
78 
79   PetscFunctionBegin;
80   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
81   aijkok->a_dual.clear_sync_state();
82   aijkok->a_dual.modify_device();
83   aijkok->transpose_updated = PETSC_FALSE;
84   aijkok->hermitian_updated = PETSC_FALSE;
85   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
86   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
87   PetscFunctionReturn(0);
88 }
89 
90 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
91 {
92   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
93 
94   PetscFunctionBegin;
95   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
96    /* We do not expect one needs factors on host  */
97   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
98   PetscCheckFalse(!aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
99   aijkok->a_dual.sync_host();
100   PetscFunctionReturn(0);
101 }
102 
103 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
104 {
105   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
106 
107   PetscFunctionBegin;
108   if (aijkok) {
109     aijkok->a_dual.sync_host();
110     *array = aijkok->a_dual.view_host().data();
111   } else { /* Happens when calling MatSetValues on a newly created matrix */
112     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
113   }
114   PetscFunctionReturn(0);
115 }
116 
117 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
118 {
119   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
120 
121   PetscFunctionBegin;
122   if (aijkok) aijkok->a_dual.modify_host();
123   PetscFunctionReturn(0);
124 }
125 
126 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
127 {
128   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
129 
130   PetscFunctionBegin;
131   if (aijkok) {
132     aijkok->a_dual.sync_host();
133     *array = aijkok->a_dual.view_host().data();
134   } else {
135     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
136   }
137   PetscFunctionReturn(0);
138 }
139 
140 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
141 {
142   PetscFunctionBegin;
143   *array = NULL;
144   PetscFunctionReturn(0);
145 }
146 
147 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
148 {
149   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
150 
151   PetscFunctionBegin;
152   if (aijkok) {
153     *array = aijkok->a_dual.view_host().data();
154   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
155     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
156   }
157   PetscFunctionReturn(0);
158 }
159 
160 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
161 {
162   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
163 
164   PetscFunctionBegin;
165   if (aijkok) {
166     aijkok->a_dual.clear_sync_state();
167     aijkok->a_dual.modify_host();
168   }
169   PetscFunctionReturn(0);
170 }
171 
172 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
173 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
174 {
175   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
176   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
177 
178   PetscFunctionBegin;
179   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
180   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
181   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
182   PetscFunctionReturn(0);
183 }
184 
185 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
186 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
187 {
188   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
189 
190   PetscFunctionBegin;
191   if (aijkok && aijkok->device_mat_d.data()) {
192     *d_mat = aijkok->device_mat_d.data();
193   } else {
194     PetscErrorCode   ierr;
195     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
196     *d_mat  = NULL;
197   }
198   PetscFunctionReturn(0);
199 }
200 
201 /* Generate the transpose on device and cache it internally */
202 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
203 {
204   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
205 
206   PetscFunctionBegin;
207   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
208   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
209     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
210     CHKERRCXX(aijkok->a_dual.sync_device());
211     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
212     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
213     aijkok->transpose_updated = PETSC_TRUE;
214   }
215   *csrmatT = &aijkok->csrmatT;
216   PetscFunctionReturn(0);
217 }
218 
219 /* Generate the Hermitian on device and cache it internally */
220 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
221 {
222   PetscErrorCode                   ierr;
223   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
224 
225   PetscFunctionBegin;
226   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
227   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
228   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
229     CHKERRCXX(aijkok->a_dual.sync_device());
230     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
231     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
232    #if defined(PETSC_USE_COMPLEX)
233     const auto& a = aijkok->csrmatH.values;
234     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
235    #endif
236     aijkok->hermitian_updated = PETSC_TRUE;
237   }
238   *csrmatH = &aijkok->csrmatH;
239   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
240   PetscFunctionReturn(0);
241 }
242 
243 /* y = A x */
244 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
245 {
246   PetscErrorCode                   ierr;
247   Mat_SeqAIJKokkos                 *aijkok;
248   ConstPetscScalarKokkosView       xv;
249   PetscScalarKokkosView            yv;
250 
251   PetscFunctionBegin;
252   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
253   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
254   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
255   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
256   ierr   = PetscLogGpuTimeBegin();CHKERRQ(ierr);
257   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
258   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
259   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
260   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
261   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
262   PetscFunctionReturn(0);
263 }
264 
265 /* y = A^T x */
266 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
267 {
268   PetscErrorCode                   ierr;
269   Mat_SeqAIJKokkos                 *aijkok;
270   const char                       *mode;
271   ConstPetscScalarKokkosView       xv;
272   PetscScalarKokkosView            yv;
273   KokkosCsrMatrix                  *csrmat;
274 
275   PetscFunctionBegin;
276   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
277   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
278   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
279   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
280   if (A->form_explicit_transpose) {
281     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
282     mode = "N";
283   } else {
284     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
285     csrmat = &aijkok->csrmat;
286     mode = "T";
287   }
288   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
289   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
290   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
291   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
292   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
293   PetscFunctionReturn(0);
294 }
295 
296 /* y = A^H x */
297 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
298 {
299   PetscErrorCode                   ierr;
300   Mat_SeqAIJKokkos                 *aijkok;
301   const char                       *mode;
302   ConstPetscScalarKokkosView       xv;
303   PetscScalarKokkosView            yv;
304   KokkosCsrMatrix                  *csrmat;
305 
306   PetscFunctionBegin;
307   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
308   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
309   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
310   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
311   if (A->form_explicit_transpose) {
312     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
313     mode = "N";
314   } else {
315     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
316     csrmat = &aijkok->csrmat;
317     mode = "C";
318   }
319   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
320   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
321   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
322   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
323   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
324   PetscFunctionReturn(0);
325 }
326 
327 /* z = A x + y */
328 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
329 {
330   PetscErrorCode                   ierr;
331   Mat_SeqAIJKokkos                 *aijkok;
332   ConstPetscScalarKokkosView       xv,yv;
333   PetscScalarKokkosView            zv;
334 
335   PetscFunctionBegin;
336   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
337   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
338   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
339   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
340   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
341   if (zz != yy) Kokkos::deep_copy(zv,yv);
342   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
343   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
344   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
345   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
346   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
347   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
348   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
349   PetscFunctionReturn(0);
350 }
351 
352 /* z = A^T x + y */
353 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
354 {
355   PetscErrorCode                   ierr;
356   Mat_SeqAIJKokkos                 *aijkok;
357   const char                       *mode;
358   ConstPetscScalarKokkosView       xv,yv;
359   PetscScalarKokkosView            zv;
360   KokkosCsrMatrix                  *csrmat;
361 
362   PetscFunctionBegin;
363   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
364   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
365   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
366   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
367   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
368   if (zz != yy) Kokkos::deep_copy(zv,yv);
369   if (A->form_explicit_transpose) {
370     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
371     mode = "N";
372   } else {
373     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
374     csrmat = &aijkok->csrmat;
375     mode = "T";
376   }
377   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
378   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
379   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
380   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
381   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
382   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
383   PetscFunctionReturn(0);
384 }
385 
386 /* z = A^H x + y */
387 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
388 {
389   PetscErrorCode                   ierr;
390   Mat_SeqAIJKokkos                 *aijkok;
391   const char                       *mode;
392   ConstPetscScalarKokkosView       xv,yv;
393   PetscScalarKokkosView            zv;
394   KokkosCsrMatrix                  *csrmat;
395 
396   PetscFunctionBegin;
397   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
398   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
399   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
400   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
401   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
402   if (zz != yy) Kokkos::deep_copy(zv,yv);
403   if (A->form_explicit_transpose) {
404     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
405     mode = "N";
406   } else {
407     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
408     csrmat = &aijkok->csrmat;
409     mode = "C";
410   }
411   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
412   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
413   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
414   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
415   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
416   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
417   PetscFunctionReturn(0);
418 }
419 
420 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
421 {
422   PetscErrorCode            ierr;
423   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
424 
425   PetscFunctionBegin;
426   switch (op) {
427     case MAT_FORM_EXPLICIT_TRANSPOSE:
428       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
429       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
430       A->form_explicit_transpose = flg;
431       break;
432     default:
433       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
434       break;
435   }
436   PetscFunctionReturn(0);
437 }
438 
439 /* Depending on reuse, either build a new mat, or use the existing mat */
440 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
441 {
442   PetscErrorCode   ierr;
443   Mat_SeqAIJ       *aseq;
444 
445   PetscFunctionBegin;
446   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
447   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
448     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
449   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
450     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
451   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
452     PetscCheckFalse(A != *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
453     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
454     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
455     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
456     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
457     aseq = static_cast<Mat_SeqAIJ*>(A->data);
458     if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */
459       PetscCheckFalse(A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
460       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
461     }
462   }
463   PetscFunctionReturn(0);
464 }
465 
466 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
467    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
468  */
469 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
470 {
471   PetscErrorCode        ierr;
472   Mat_SeqAIJ            *bseq;
473   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
474   Mat                   mat;
475 
476   PetscFunctionBegin;
477   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
478   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
479   mat  = *B;
480   if (A->assembled) {
481     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
482     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
483     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
484     /* Now copy values to B if needed */
485     if (dupOption == MAT_COPY_VALUES) {
486       if (akok->a_dual.need_sync_device()) {
487         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
488         bkok->a_dual.modify_host();
489       } else { /* If device has the latest data, we only copy data on device */
490         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
491         bkok->a_dual.modify_device();
492       }
493     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
494       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
495       bkok->a_dual.modify_host();
496     }
497     mat->spptr = bkok;
498   }
499 
500   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
501   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
502   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
503   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
504   PetscFunctionReturn(0);
505 }
506 
507 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
508 {
509   PetscErrorCode    ierr;
510   Mat               At;
511   KokkosCsrMatrix   *internT;
512   Mat_SeqAIJKokkos  *atkok,*bkok;
513 
514   PetscFunctionBegin;
515   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
516   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
517     /* Deep copy internT, as we want to isolate the internal transpose */
518     CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
519     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
520     if (reuse == MAT_INITIAL_MATRIX) *B = At;
521     else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
522   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
523     if ((*B)->assembled) {
524       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
525       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
526       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
527     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
528       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
529       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
530       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
531       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
532       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
533     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
534   }
535   PetscFunctionReturn(0);
536 }
537 
538 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
539 {
540   PetscErrorCode             ierr;
541   Mat_SeqAIJKokkos           *aijkok;
542 
543   PetscFunctionBegin;
544   if (A->factortype == MAT_FACTOR_NONE) {
545     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
546     delete aijkok;
547   } else {
548     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
549   }
550   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
551   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
552   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       NULL);CHKERRQ(ierr);
553   A->spptr = NULL;
554   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
555   PetscFunctionReturn(0);
556 }
557 
558 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
559 {
560   PetscErrorCode ierr;
561 
562   PetscFunctionBegin;
563   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
564   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
565   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
566   PetscFunctionReturn(0);
567 }
568 
569 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
570 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
571 {
572   PetscErrorCode               ierr;
573   Mat_SeqAIJ                   *a,*b;
574   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
575   MatScalarKokkosView          aa,ba,ca;
576   MatRowMapKokkosView          ai,bi,ci;
577   MatColIdxKokkosView          aj,bj,cj;
578   PetscInt                     m,n,nnz,aN;
579 
580   PetscFunctionBegin;
581   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
582   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
583   PetscValidPointer(C,4);
584   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
585   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
586   PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
587   PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
588 
589   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
590   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
591   a    = static_cast<Mat_SeqAIJ*>(A->data);
592   b    = static_cast<Mat_SeqAIJ*>(B->data);
593   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
594   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
595   aa   = akok->a_dual.view_device();
596   ai   = akok->i_dual.view_device();
597   ba   = bkok->a_dual.view_device();
598   bi   = bkok->i_dual.view_device();
599   m    = A->rmap->n; /* M, N and nnz of C */
600   n    = A->cmap->n + B->cmap->n;
601   nnz  = a->nz + b->nz;
602   aN   = A->cmap->n; /* N of A */
603   if (reuse == MAT_INITIAL_MATRIX) {
604     aj = akok->j_dual.view_device();
605     bj = bkok->j_dual.view_device();
606     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
607     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
608     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
609     ca = ca_dual.view_device();
610     ci = ci_dual.view_device();
611     cj = cj_dual.view_device();
612 
613     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
614     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
615       PetscInt i = t.league_rank(); /* row i */
616       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
617 
618       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
619         ci(i) = coffset;
620         if (i == m-1) ci(m) = ai(m) + bi(m);
621       });
622 
623       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
624         if (k < alen) {
625           ca(coffset+k) = aa(ai(i)+k);
626           cj(coffset+k) = aj(ai(i)+k);
627         } else {
628           ca(coffset+k) = ba(bi(i)+k-alen);
629           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
630         }
631       });
632     });
633     ca_dual.modify_device();
634     ci_dual.modify_device();
635     cj_dual.modify_device();
636     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
637     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
638   } else if (reuse == MAT_REUSE_MATRIX) {
639     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
640     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
641     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
642     ca   = ckok->a_dual.view_device();
643     ci   = ckok->i_dual.view_device();
644 
645     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
646       PetscInt i = t.league_rank(); /* row i */
647       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
648       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
649         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
650         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
651       });
652     });
653     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
654   }
655   PetscFunctionReturn(0);
656 }
657 
658 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
659 {
660   PetscFunctionBegin;
661   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
662   PetscFunctionReturn(0);
663 }
664 
665 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
666 {
667   PetscErrorCode                 ierr;
668   Mat_Product                    *product = C->product;
669   Mat                            A,B;
670   bool                           transA,transB; /* use bool, since KK needs this type */
671   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
672   Mat_SeqAIJ                     *c;
673   MatProductData_SeqAIJKokkos    *pdata;
674   KokkosCsrMatrix                *csrmatA,*csrmatB;
675 
676   PetscFunctionBegin;
677   MatCheckProduct(C,1);
678   PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
679   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
680 
681   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
682     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
683     PetscFunctionReturn(0);
684   }
685 
686   switch (product->type) {
687     case MATPRODUCT_AB:  transA = false; transB = false; break;
688     case MATPRODUCT_AtB: transA = true;  transB = false; break;
689     case MATPRODUCT_ABt: transA = false; transB = true;  break;
690     default:
691       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
692   }
693 
694   A     = product->A;
695   B     = product->B;
696   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
697   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
698   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
699   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
700   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
701 
702   PetscCheckFalse(!ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
703 
704   csrmatA = &akok->csrmat;
705   csrmatB = &bkok->csrmat;
706 
707   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
708   if (transA) {
709     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
710     transA = false;
711   }
712 
713   if (transB) {
714     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
715     transB = false;
716   }
717   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
718   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
719   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
720   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
721   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
722   /* shorter version of MatAssemblyEnd_SeqAIJ */
723   c = (Mat_SeqAIJ*)C->data;
724   ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
725   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
726   ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
727   c->reallocs         = 0;
728   C->info.mallocs     = 0;
729   C->info.nz_unneeded = 0;
730   C->assembled        = C->was_assembled = PETSC_TRUE;
731   C->num_ass++;
732   PetscFunctionReturn(0);
733 }
734 
735 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
736 {
737   PetscErrorCode                 ierr;
738   Mat_Product                    *product = C->product;
739   MatProductType                 ptype;
740   Mat                            A,B;
741   bool                           transA,transB;
742   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
743   MatProductData_SeqAIJKokkos    *pdata;
744   MPI_Comm                       comm;
745   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
746 
747   PetscFunctionBegin;
748   MatCheckProduct(C,1);
749   ierr = PetscObjectGetComm((PetscObject)C,&comm);
750   PetscCheckFalse(product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
751   A       = product->A;
752   B       = product->B;
753   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
754   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
755   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
756   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
757   csrmatA = &akok->csrmat;
758   csrmatB = &bkok->csrmat;
759 
760   ptype   = product->type;
761   switch (ptype) {
762     case MATPRODUCT_AB:  transA = false; transB = false; break;
763     case MATPRODUCT_AtB: transA = true;  transB = false; break;
764     case MATPRODUCT_ABt: transA = false; transB = true;  break;
765     default:
766       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
767   }
768 
769   product->data = pdata = new MatProductData_SeqAIJKokkos();
770   pdata->kh.set_team_work_size(16);
771   pdata->kh.set_dynamic_scheduling(true);
772   pdata->reusesym = product->api_user;
773 
774   /* TODO: add command line options to select spgemm algorithms */
775   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
776   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
777      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
778    */
779   pdata->kh.create_spgemm_handle(spgemm_alg);
780 
781   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
782   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
783   if (transA) {
784     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
785     transA = false;
786   }
787 
788   if (transB) {
789     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
790     transB = false;
791   }
792 
793   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
794   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
795     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
796     calling new Mat_SeqAIJKokkos().
797     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
798   */
799   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
800   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
801   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
802 
803   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
804   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
805   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
806   PetscFunctionReturn(0);
807 }
808 
809 /* handles sparse matrix matrix ops */
810 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
811 {
812   PetscErrorCode ierr;
813   Mat_Product    *product = mat->product;
814   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
815 
816   PetscFunctionBegin;
817   MatCheckProduct(mat,1);
818   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
819   if (product->type == MATPRODUCT_ABC) {
820     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
821   }
822   if (Biskok && Ciskok) {
823     switch (product->type) {
824     case MATPRODUCT_AB:
825     case MATPRODUCT_AtB:
826     case MATPRODUCT_ABt:
827       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
828       break;
829     case MATPRODUCT_PtAP:
830     case MATPRODUCT_RARt:
831     case MATPRODUCT_ABC:
832       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
833       break;
834     default:
835       break;
836     }
837   } else { /* fallback for AIJ */
838     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
839   }
840   PetscFunctionReturn(0);
841 }
842 
843 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
844 {
845   PetscErrorCode   ierr;
846   Mat_SeqAIJKokkos *aijkok;
847 
848   PetscFunctionBegin;
849   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
850   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
851   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
852   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
853   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
854   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
855   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
856   PetscFunctionReturn(0);
857 }
858 
859 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
860 {
861   PetscErrorCode   ierr;
862   Mat_SeqAIJKokkos *aijkok;
863 
864   PetscFunctionBegin;
865   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
866   if (aijkok) { /* Only zero the device if data is already there */
867     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
868     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
869   } else { /* Might be preallocated but not assembled */
870     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
871   }
872   PetscFunctionReturn(0);
873 }
874 
875 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
876 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
877 {
878   PetscErrorCode     ierr;
879   Mat_SeqAIJKokkos   *aijkok;
880 
881   PetscFunctionBegin;
882   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
883   PetscValidPointer(kv,2);
884   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
885   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
886   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
887   *kv    = aijkok->a_dual.view_device();
888   PetscFunctionReturn(0);
889 }
890 
891 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
892 {
893   PetscFunctionBegin;
894   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
895   PetscValidPointer(kv,2);
896   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
897   PetscFunctionReturn(0);
898 }
899 
900 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
901 {
902   PetscErrorCode     ierr;
903   Mat_SeqAIJKokkos   *aijkok;
904 
905   PetscFunctionBegin;
906   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
907   PetscValidPointer(kv,2);
908   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
909   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
910   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
911   *kv    = aijkok->a_dual.view_device();
912   PetscFunctionReturn(0);
913 }
914 
915 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
916 {
917   PetscErrorCode     ierr;
918 
919   PetscFunctionBegin;
920   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
921   PetscValidPointer(kv,2);
922   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
923   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
924   PetscFunctionReturn(0);
925 }
926 
927 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
928 {
929   Mat_SeqAIJKokkos   *aijkok;
930 
931   PetscFunctionBegin;
932   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
933   PetscValidPointer(kv,2);
934   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
935   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
936   *kv    = aijkok->a_dual.view_device();
937   PetscFunctionReturn(0);
938 }
939 
940 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
941 {
942   PetscErrorCode     ierr;
943 
944   PetscFunctionBegin;
945   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
946   PetscValidPointer(kv,2);
947   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
948   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
949   PetscFunctionReturn(0);
950 }
951 
952 /* Computes Y += alpha X */
953 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
954 {
955   PetscErrorCode             ierr;
956   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
957   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
958   ConstMatScalarKokkosView   Xa;
959   MatScalarKokkosView        Ya;
960 
961   PetscFunctionBegin;
962   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
963   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
964   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
965   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
966   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
967 
968   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
969     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
970     PetscBool e;
971     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
972     if (e) {
973       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
974       if (e) pattern = SAME_NONZERO_PATTERN;
975     }
976   }
977 
978   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
979     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
980     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
981     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
982   */
983   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
984   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
985   Xa   = xkok->a_dual.view_device();
986   Ya   = ykok->a_dual.view_device();
987 
988   if (pattern == SAME_NONZERO_PATTERN) {
989     KokkosBlas::axpy(alpha,Xa,Ya);
990     ierr = MatSeqAIJKokkosModifyDevice(Y);
991   } else if (pattern == SUBSET_NONZERO_PATTERN) {
992     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
993     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
994 
995     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
996       PetscInt i = t.league_rank(); /* row i */
997       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
998         PetscInt p,q = Yi(i);
999         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1000           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1001           if (Xj(p) == Yj(q)) { /* Found it */
1002             Ya(q) += alpha * Xa(p);
1003             q++;
1004           } else {
1005             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1006                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1007             */
1008             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1009           }
1010         }
1011       });
1012     });
1013     ierr = MatSeqAIJKokkosModifyDevice(Y);
1014   } else { /* different nonzero patterns */
1015     Mat             Z;
1016     KokkosCsrMatrix zcsr;
1017     KernelHandle    kh;
1018     kh.create_spadd_handle(false);
1019     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1020     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1021     zkok = new Mat_SeqAIJKokkos(zcsr);
1022     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1023     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1024     kh.destroy_spadd_handle();
1025   }
1026   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1027   ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */
1028   PetscFunctionReturn(0);
1029 }
1030 
1031 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
1032 {
1033   PetscErrorCode            ierr;
1034   MPI_Comm                  comm;
1035   PetscInt                  *i,*j,*perm,*jmap;
1036   PetscInt                  k,M,N,p,q,row,start,end,nnz,nneg;
1037   PetscBool                 has_repeats = PETSC_FALSE;
1038   PetscInt                  *Ai,*Aj;
1039   PetscScalar               *Aa;
1040   Mat_SeqAIJKokkos          *akok;
1041   Mat_SeqAIJ                *aseq;
1042   MatRowMapKokkosViewHost   perm_h("perm",n),jmap_h;
1043   Mat                       newmat;
1044 
1045   PetscFunctionBegin;
1046   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
1047   ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr);
1048   ierr = PetscMalloc2(n,&i,n,&j);CHKERRQ(ierr);
1049   ierr = PetscArraycpy(i,coo_i,n);CHKERRQ(ierr); /* Make a copy since we'll modify it */
1050   ierr = PetscArraycpy(j,coo_j,n);CHKERRQ(ierr);
1051   perm = perm_h.data();
1052   for (k=0; k<n; k++) { /* Ignore entries with negative row or col indices */
1053     if (j[k] < 0) i[k] = -1;
1054     perm[k] = k;
1055   }
1056 
1057   /* Sort by row */
1058   ierr = PetscSortIntWithArrayPair(n,i,j,perm);CHKERRQ(ierr);
1059   for (k=0; k<n; k++) {if (i[k] >= 0) break;} /* Advance k to the first row with a non-negative index */
1060   nneg = k;
1061   Kokkos::resize(jmap_h,n-nneg+1); /* Allocate an extra to make a CSR-like data structure. jmap[i] is the number of repeats for i-th nonzero */
1062   nnz  = 0; /* Total number of unique nonzeros to be counted */
1063   jmap = jmap_h.data(); /* In the end, only the first nnz+1 elements in jmap[] are significant. */
1064   jmap++; /* Inc jmap by 1 for convinience */
1065 
1066   ierr = PetscCalloc1(M+1,&Ai);CHKERRQ(ierr); /* CSR of A */
1067   ierr = PetscMalloc1(n-nneg,&Aj);CHKERRQ(ierr); /* We have at most n-k unique nonzeros */
1068 
1069   /* In each row, sort by column, then unique column indices to get row length */
1070   Ai++; /* Inc by 1 for convinience */
1071   q = 0; /* q-th unique nonzero, with q starting from 0 */
1072   while (k<n) {
1073     row   = i[k];
1074     start = k; /* [start,end) indices for this row */
1075     while (k<n && i[k] == row) k++;
1076     end   = k;
1077     ierr  = PetscSortIntWithArray(end-start,j+start,perm+start);CHKERRQ(ierr);
1078     /* Find number of unique col entries in this row */
1079     Aj[q]   = j[start]; /* Log the first nonzero in this row */
1080     jmap[q] = 1; /* Number of repeats of this nozero entry */
1081     Ai[row] = 1;
1082     nnz++;
1083 
1084     for (p=start+1; p<end; p++) { /* Scan remaining nonzero in this row */
1085       if (j[p] != j[p-1]) { /* Meet a new nonzero */
1086         q++;
1087         jmap[q] = 1;
1088         Aj[q]   = j[p];
1089         Ai[row]++;
1090         nnz++;
1091       } else {
1092         jmap[q]++;
1093         has_repeats = PETSC_TRUE;
1094       }
1095     }
1096     q++; /* Move to next row and thus next unique nonzero */
1097   }
1098   ierr = PetscFree2(i,j);CHKERRQ(ierr);
1099 
1100   Ai--; /* Back to the beginning of Ai[] */
1101   for (k=0; k<M; k++) Ai[k+1] += Ai[k];
1102   jmap--; /* Back to the beginning of jmap[] */
1103   jmap[0] = 0;
1104   if (has_repeats) { /* Only transform jmap[] to CSR when having repeats, otherwise jmap[] is not used */
1105     for (k=0; k<nnz; k++) jmap[k+1] += jmap[k];
1106     Kokkos::resize(jmap_h,nnz+1); /* Resize wrt the actual number of nonzeros */
1107   }
1108 
1109   if (nnz < n-nneg) { /* Realloc Aj[] to actual number of nonzeros */
1110     PetscInt *Aj_new;
1111     ierr = PetscMalloc1(nnz,&Aj_new);CHKERRQ(ierr);
1112     ierr = PetscArraycpy(Aj_new,Aj,nnz);CHKERRQ(ierr);
1113     ierr = PetscFree(Aj);CHKERRQ(ierr);
1114     Aj = Aj_new;
1115   }
1116 
1117   ierr = PetscMalloc1(nnz,&Aa);CHKERRQ(ierr);
1118   ierr = MatCreateSeqAIJWithArrays(comm,M,N,Ai,Aj,Aa,&newmat);CHKERRQ(ierr);
1119   aseq = static_cast<Mat_SeqAIJ*>(newmat->data);
1120   aseq->singlemalloc = PETSC_FALSE; /* Let newmat own Ai,Aj,Aa */
1121   aseq->free_a       = aseq->free_ij = PETSC_TRUE;
1122 
1123   ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
1124   ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
1125   ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
1126 
1127   if (nneg) { /* Discard heading entries with negative indices in perm_h, as we'll access it from index 0 in MatSetValuesCOO */
1128     MatRowMapKokkosViewHost   newperm_h("perm",n-nneg);
1129     Kokkos::deep_copy(newperm_h,Kokkos::subview(perm_h,Kokkos::make_pair(nneg,n)));
1130     perm_h = newperm_h;
1131   }
1132 
1133   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1134   akok->SetUpCOO(n,has_repeats,jmap_h,perm_h);
1135   PetscFunctionReturn(0);
1136 }
1137 
1138 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1139 {
1140   PetscErrorCode              ierr;
1141   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1142   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1143   PetscInt                    nz = aseq->nz;
1144   const MatRowMapKokkosView&  jmap = akok->jmap_d;
1145   const MatRowMapKokkosView&  perm = akok->perm_d;
1146   MatScalarKokkosView         Aa;
1147   ConstMatScalarKokkosView    kv;
1148   PetscMemType                memtype;
1149 
1150   PetscFunctionBegin;
1151   if (!v) { /* NULL v means an all zero array */
1152     ierr = MatZeroEntries(A);CHKERRQ(ierr);
1153     PetscFunctionReturn(0);
1154   }
1155 
1156   ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
1157   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1158     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,akok->coo_n));
1159   } else {
1160     kv = ConstMatScalarKokkosView(v,akok->coo_n); /* Directly use v[]'s memory */
1161   }
1162 
1163   ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr); /* Might read and write matrix values */
1164   if (imode == INSERT_VALUES) {
1165     Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
1166   }
1167 
1168   if (akok->coo_has_repeats) {
1169     Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) {
1170       for (PetscInt k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));
1171     });
1172   } else {
1173     Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) {Aa(i) += kv(perm(i));});
1174   }
1175   ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);
1176   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1177   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1182 {
1183   PetscErrorCode ierr;
1184 
1185   PetscFunctionBegin;
1186   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
1187   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
1188   B->offloadmask = PETSC_OFFLOAD_CPU;
1189   PetscFunctionReturn(0);
1190 }
1191 
1192 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1193 {
1194   PetscErrorCode     ierr;
1195   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1196 
1197   PetscFunctionBegin;
1198   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1199   A->boundtocpu  = PETSC_FALSE;
1200 
1201   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1202   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1203   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1204   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1205   A->ops->scale                     = MatScale_SeqAIJKokkos;
1206   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1207   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1208   A->ops->mult                      = MatMult_SeqAIJKokkos;
1209   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1210   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1211   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1212   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1213   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1214   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1215   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1216   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1217   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1218   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1219   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1220   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1221   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1222   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1223 
1224   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr);
1225   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr);
1226   PetscFunctionReturn(0);
1227 }
1228 
1229 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1230 {
1231   PetscErrorCode ierr;
1232   Mat_SeqAIJ     *aseq;
1233   PetscInt       i,m,n;
1234 
1235   PetscFunctionBegin;
1236   PetscCheckFalse(A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1237 
1238   m    = akok->nrows();
1239   n    = akok->ncols();
1240   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1241   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1242 
1243   /* Set up data structures of A as a MATSEQAIJ */
1244   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1245   aseq = (Mat_SeqAIJ*)(A)->data;
1246 
1247   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1248   akok->j_dual.sync_host();
1249 
1250   aseq->i            = akok->i_host_data();
1251   aseq->j            = akok->j_host_data();
1252   aseq->a            = akok->a_host_data();
1253   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1254   aseq->singlemalloc = PETSC_FALSE;
1255   aseq->free_a       = PETSC_FALSE;
1256   aseq->free_ij      = PETSC_FALSE;
1257   aseq->nz           = akok->nnz();
1258   aseq->maxnz        = aseq->nz;
1259 
1260   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1261   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1262   for (i=0; i<m; i++) {
1263     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1264   }
1265 
1266   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1267   akok->nonzerostate = A->nonzerostate;
1268   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1269   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1270   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1271   PetscFunctionReturn(0);
1272 }
1273 
1274 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1275 
1276    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1277  */
1278 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1279 {
1280   PetscErrorCode ierr;
1281 
1282   PetscFunctionBegin;
1283   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1284   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
1285   PetscFunctionReturn(0);
1286 }
1287 
1288 /* --------------------------------------------------------------------------------*/
1289 /*@C
1290    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1291    (the default parallel PETSc format). This matrix will ultimately be handled by
1292    Kokkos for calculations. For good matrix
1293    assembly performance the user should preallocate the matrix storage by setting
1294    the parameter nz (or the array nnz).  By setting these parameters accurately,
1295    performance during matrix assembly can be increased by more than a factor of 50.
1296 
1297    Collective
1298 
1299    Input Parameters:
1300 +  comm - MPI communicator, set to PETSC_COMM_SELF
1301 .  m - number of rows
1302 .  n - number of columns
1303 .  nz - number of nonzeros per row (same for all rows)
1304 -  nnz - array containing the number of nonzeros in the various rows
1305          (possibly different for each row) or NULL
1306 
1307    Output Parameter:
1308 .  A - the matrix
1309 
1310    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1311    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1312    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1313 
1314    Notes:
1315    If nnz is given then nz is ignored
1316 
1317    The AIJ format (also called the Yale sparse matrix format or
1318    compressed row storage), is fully compatible with standard Fortran 77
1319    storage.  That is, the stored row and column indices can begin at
1320    either one (as in Fortran) or zero.  See the users' manual for details.
1321 
1322    Specify the preallocated storage with either nz or nnz (not both).
1323    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1324    allocation.  For large problems you MUST preallocate memory or you
1325    will get TERRIBLE performance, see the users' manual chapter on matrices.
1326 
1327    By default, this format uses inodes (identical nodes) when possible, to
1328    improve numerical efficiency of matrix-vector products and solves. We
1329    search for consecutive rows with the same nonzero structure, thereby
1330    reusing matrix information to achieve increased efficiency.
1331 
1332    Level: intermediate
1333 
1334 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1335 @*/
1336 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1337 {
1338   PetscErrorCode ierr;
1339 
1340   PetscFunctionBegin;
1341   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1342   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1343   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1344   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1345   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1346   PetscFunctionReturn(0);
1347 }
1348 
1349 typedef Kokkos::TeamPolicy<>::member_type team_member;
1350 //
1351 // This factorization exploits block diagonal matrices with "Nf" (not used).
1352 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1353 //
1354 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1355 {
1356   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1357   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1358   IS                 isrow = b->row,isicol = b->icol;
1359   PetscErrorCode     ierr;
1360   const PetscInt     *r_h,*ic_h;
1361   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1362   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1363   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1364   PetscBool          row_identity,col_identity;
1365   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1366 
1367   PetscFunctionBegin;
1368   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1369   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
1370   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1371   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
1372   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
1373   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
1374   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1375   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1376   {
1377 #define KOKKOS_SHARED_LEVEL 1
1378     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1379     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1380     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1381     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1382     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1383     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1384     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1385     size_t flops_h = 0.0;
1386     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1387     Kokkos::View<size_t> d_flops_k ("flops");
1388     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1389     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1390     Kokkos::deep_copy (d_flops_k, h_flops_k);
1391     Kokkos::deep_copy (d_r_k, h_r_k);
1392     Kokkos::deep_copy (d_ic_k, h_ic_k);
1393     // Fill A --> fact
1394     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1395         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1396         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1397         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1398         // zero rows of B
1399         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1400             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1401             PetscScalar *baL = ba_d + bi_d[rowb];
1402             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1403             /* zero (unfactored row) */
1404             for (int j=0;j<nzbL;j++) baL[j] = 0;
1405             for (int j=0;j<nzbU;j++) baU[j] = 0;
1406           });
1407         // copy A into B
1408         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1409             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1410             const PetscScalar *av    = aa_d + ai_d[rowa];
1411             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1412             /* load in initial (unfactored row) */
1413             for (int j=0;j<nza;j++) {
1414               PetscInt    colb = ic[ajtmp[j]];
1415               PetscScalar vala = av[j];
1416               if (colb == rowb) {
1417                 *(ba_d + bdiag_d[rowb]) = vala;
1418               } else {
1419                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1420                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1421                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1422                 for (int j=0; j<nz ; j++) {
1423                   if (pbj[j] == colb) {
1424                     pba[j] = vala;
1425                     set++;
1426                     break;
1427                   }
1428                 }
1429                #if !defined(PETSC_HAVE_SYCL)
1430                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1431                #endif
1432               }
1433             }
1434           });
1435       });
1436     Kokkos::fence();
1437 
1438     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1439         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1440         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1441         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1442         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1443         const PetscInt  start = field*nloc, end = start + nloc;
1444         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1445         // A22 panel update for each row A(1,:) and col A(:,1)
1446         for (int ii=start; ii<end-1; ii++) {
1447           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1448           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1449           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1450           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1451           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1452               PetscInt kIdx = j*Ni + field_block_idx;
1453               if (kIdx >= nzUi) /* void */ ;
1454               else {
1455                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1456                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1457                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1458                 size_t         st_idx;
1459                 // find and do L(k,i) = A(:k,i) / A(i,i)
1460                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1461                 // get column, there has got to be a better way
1462                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1463                     if (pjL[j] == ii) {
1464                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1465                       idx = j; // output
1466                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1467                     }
1468                 }, st_idx);
1469                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1470 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1471                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1472 #endif
1473                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1474                 // U(i+1,:end)
1475                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1476                       PetscScalar Uij = baUi[uiIdx];
1477                       PetscInt    col = bjUi[uiIdx];
1478                       if (col==myk) {
1479                         // A_kk = A_kk - L_ki * U_ij(k)
1480                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1481                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1482                       } else {
1483                         PetscScalar    *start, *end, *pAkjv=NULL;
1484                         PetscInt       high, low;
1485                         const PetscInt *startj;
1486                         if (col<myk) { // L
1487                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1488                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1489                           start = pLki+1; // start at pLki+1, A22(myk,1)
1490                           startj= bj_d + bi_d[myk] + idx;
1491                           end   = ba_d + bi_d[myk+1];
1492                         } else {
1493                           PetscInt idx = bdiag_d[myk+1]+1;
1494                           start = ba_d + idx;
1495                           startj= bj_d + idx;
1496                           end   = ba_d + bdiag_d[myk];
1497                         }
1498                         // search for 'col', use bisection search - TODO
1499                         low  = 0;
1500                         high = (PetscInt)(end-start);
1501                         while (high-low > 5) {
1502                           int t = (low+high)/2;
1503                           if (startj[t] > col) high = t;
1504                           else                 low  = t;
1505                         }
1506                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1507                           if (startj[pAkjv-start] == col) break;
1508                         }
1509 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1510                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1511 #endif
1512                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1513                       }
1514                     });
1515               }
1516             });
1517           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1518           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1519         } /* endof for (i=0; i<n; i++) { */
1520         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1521       });
1522     Kokkos::fence();
1523     Kokkos::deep_copy (h_flops_k, d_flops_k);
1524     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1525     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1526         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1527         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1528         /* Invert diagonal for simpler triangular solves */
1529         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1530             int i = start + outer_index*Ni + lg_rank%Ni;
1531             if (i < end) {
1532               PetscScalar *pv = ba_d + bdiag_d[i];
1533               *pv = 1.0/(*pv);
1534             }
1535           });
1536       });
1537   }
1538   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1539   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
1540   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
1541 
1542   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1543   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
1544   if (b->inode.size) {
1545     B->ops->solve = MatSolve_SeqAIJ_Inode;
1546   } else if (row_identity && col_identity) {
1547     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1548   } else {
1549     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1550   }
1551   B->offloadmask = PETSC_OFFLOAD_GPU;
1552   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
1553   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1554   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1555   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1556   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1557   B->assembled              = PETSC_TRUE;
1558   B->preallocated           = PETSC_TRUE;
1559 
1560   PetscFunctionReturn(0);
1561 }
1562 
1563 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1564 {
1565   PetscErrorCode   ierr;
1566 
1567   PetscFunctionBegin;
1568   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1569   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1570   PetscFunctionReturn(0);
1571 }
1572 
1573 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1574 {
1575   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1576 
1577   PetscFunctionBegin;
1578   if (!factors->sptrsv_symbolic_completed) {
1579     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1580     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1581     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1582   }
1583   PetscFunctionReturn(0);
1584 }
1585 
1586 /* Check if we need to update factors etc for transpose solve */
1587 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1588 {
1589   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1590   MatColIdxType             n = A->rmap->n;
1591 
1592   PetscFunctionBegin;
1593   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1594     /* Update L^T and do sptrsv symbolic */
1595     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1596     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1597     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1598     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1599 
1600     KokkosKernels::Impl::transpose_matrix<
1601       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1602       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1603       MatRowMapKokkosView,DefaultExecutionSpace>(
1604         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1605         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1606 
1607     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1608       We have to sort the indices, until KK provides finer control options.
1609     */
1610     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1611       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1612         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1613 
1614     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1615 
1616     /* Update U^T and do sptrsv symbolic */
1617     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1618     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1619     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1620     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1621 
1622     KokkosKernels::Impl::transpose_matrix<
1623       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1624       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1625       MatRowMapKokkosView,DefaultExecutionSpace>(
1626         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1627         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1628 
1629     /* Sort indices. See comments above */
1630     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1631       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1632         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1633 
1634     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1635     factors->transpose_updated = PETSC_TRUE;
1636   }
1637   PetscFunctionReturn(0);
1638 }
1639 
1640 /* Solve Ax = b, with A = LU */
1641 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1642 {
1643   PetscErrorCode                 ierr;
1644   ConstPetscScalarKokkosView     bv;
1645   PetscScalarKokkosView          xv;
1646   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1647 
1648   PetscFunctionBegin;
1649   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1650   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
1651   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1652   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1653   /* Solve L tmpv = b */
1654   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1655   /* Solve Ux = tmpv */
1656   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1657   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1658   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1659   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1660   PetscFunctionReturn(0);
1661 }
1662 
1663 /* Solve A^T x = b, where A^T = U^T L^T */
1664 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1665 {
1666   PetscErrorCode                 ierr;
1667   ConstPetscScalarKokkosView     bv;
1668   PetscScalarKokkosView          xv;
1669   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1670 
1671   PetscFunctionBegin;
1672   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1673   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
1674   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1675   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1676   /* Solve U^T tmpv = b */
1677   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1678 
1679   /* Solve L^T x = tmpv */
1680   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1681   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1682   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1683   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1684   PetscFunctionReturn(0);
1685 }
1686 
1687 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1688 {
1689   PetscErrorCode                 ierr;
1690   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1691   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1692   PetscInt                       fill_lev = info->levels;
1693 
1694   PetscFunctionBegin;
1695   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1696   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1697 
1698   auto a_d = aijkok->a_dual.view_device();
1699   auto i_d = aijkok->i_dual.view_device();
1700   auto j_d = aijkok->j_dual.view_device();
1701 
1702   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1703 
1704   B->assembled                       = PETSC_TRUE;
1705   B->preallocated                    = PETSC_TRUE;
1706   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1707   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1708   B->ops->matsolve                   = NULL;
1709   B->ops->matsolvetranspose          = NULL;
1710   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1711 
1712   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1713   factors->transpose_updated         = PETSC_FALSE;
1714   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1715   /* TODO: log flops, but how to know that? */
1716   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1717   PetscFunctionReturn(0);
1718 }
1719 
1720 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1721 {
1722   PetscErrorCode                 ierr;
1723   Mat_SeqAIJKokkos               *aijkok;
1724   Mat_SeqAIJ                     *b;
1725   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1726   PetscInt                       fill_lev = info->levels;
1727   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1728   PetscInt                       n = A->rmap->n;
1729 
1730   PetscFunctionBegin;
1731   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1732   /* Rebuild factors */
1733   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1734   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1735 
1736   /* Create a spiluk handle and then do symbolic factorization */
1737   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1738   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1739 
1740   auto spiluk_handle = factors->kh.get_spiluk_handle();
1741 
1742   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1743   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1744   Kokkos::realloc(factors->iU_d,n+1);
1745   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1746 
1747   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1748   auto i_d = aijkok->i_dual.view_device();
1749   auto j_d = aijkok->j_dual.view_device();
1750   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1751   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1752 
1753   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1754   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1755   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1756   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1757 
1758   /* TODO: add options to select sptrsv algorithms */
1759   /* Create sptrsv handles for L, U and their transpose */
1760  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1761   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1762  #else
1763   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1764  #endif
1765 
1766   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1767   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1768   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1769   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1770 
1771   /* Fill fields of the factor matrix B */
1772   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1773   b     = (Mat_SeqAIJ*)B->data;
1774   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1775   B->info.fill_ratio_given  = info->fill;
1776   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1777 
1778   B->offloadmask            = PETSC_OFFLOAD_GPU;
1779   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1780   PetscFunctionReturn(0);
1781 }
1782 
1783 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1784 {
1785   PetscErrorCode   ierr;
1786   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1787   const PetscInt   nrows   = A->rmap->n;
1788 
1789   PetscFunctionBegin;
1790   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1791   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1792   // move B data into Kokkos
1793   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
1794   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
1795   {
1796     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1797     if (!baijkok->diag_d.extent(0)) {
1798       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1799       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1800       Kokkos::deep_copy (baijkok->diag_d, h_diag);
1801     }
1802   }
1803   PetscFunctionReturn(0);
1804 }
1805 
1806 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1807 {
1808   PetscFunctionBegin;
1809   *type = MATSOLVERKOKKOS;
1810   PetscFunctionReturn(0);
1811 }
1812 
1813 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1814 {
1815   PetscFunctionBegin;
1816   *type = MATSOLVERKOKKOSDEVICE;
1817   PetscFunctionReturn(0);
1818 }
1819 
1820 /*MC
1821   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1822   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1823 
1824   Level: beginner
1825 
1826 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1827 M*/
1828 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1829 {
1830   PetscErrorCode ierr;
1831   PetscInt       n = A->rmap->n;
1832 
1833   PetscFunctionBegin;
1834   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1835   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1836   (*B)->factortype = ftype;
1837   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1838   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1839 
1840   if (ftype == MAT_FACTOR_LU) {
1841     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1842     (*B)->canuseordering         = PETSC_TRUE;
1843     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1844   } else if (ftype == MAT_FACTOR_ILU) {
1845     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1846     (*B)->canuseordering         = PETSC_FALSE;
1847     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1848   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1849 
1850   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1851   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1852   PetscFunctionReturn(0);
1853 }
1854 
1855 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1856 {
1857   PetscErrorCode ierr;
1858   PetscInt       n = A->rmap->n;
1859 
1860   PetscFunctionBegin;
1861   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1862   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1863   (*B)->factortype = ftype;
1864   (*B)->canuseordering = PETSC_TRUE;
1865   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1866   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1867 
1868   if (ftype == MAT_FACTOR_LU) {
1869     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1870     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1871   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1872 
1873   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1874   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1875   PetscFunctionReturn(0);
1876 }
1877 
1878 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1879 {
1880   PetscErrorCode ierr;
1881 
1882   PetscFunctionBegin;
1883   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1884   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1885   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1886   PetscFunctionReturn(0);
1887 }
1888 
1889 /* Utility to print out a KokkosCsrMatrix for debugging */
1890 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1891 {
1892   PetscErrorCode    ierr;
1893   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1894   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1895   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1896   const PetscInt    *i = iv.data();
1897   const PetscInt    *j = jv.data();
1898   const PetscScalar *a = av.data();
1899   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1900 
1901   PetscFunctionBegin;
1902   ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1903   for (PetscInt k=0; k<m; k++) {
1904     ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr);
1905     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1906       ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1907     }
1908     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1909   }
1910   PetscFunctionReturn(0);
1911 }
1912