xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision e6befd462eead436f6b5cfbb7e320542053d4fca)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosKernels_Sorting.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 
18 #include <../src/mat/impls/aij/seq/aij.h>
19 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
20 
21 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
22 
23 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
24    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
25    In the latter case, it is important to set a_dual's sync state correctly.
26  */
27 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
28 {
29   PetscErrorCode    ierr;
30   Mat_SeqAIJ        *aijseq;
31   Mat_SeqAIJKokkos  *aijkok;
32 
33   PetscFunctionBegin;
34   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
35   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
36 
37   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
38   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
39 
40   /* If aijkok does not exist, we just copy i, j to device.
41      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
42      In both cases, we build a new aijkok structure.
43   */
44   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
45     delete aijkok;
46     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
47     A->spptr = aijkok;
48   }
49 
50   if (aijkok && aijkok->device_mat_d.data()) {
51     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
52   }
53   PetscFunctionReturn(0);
54 }
55 
56 /* Sync CSR data to device if not yet */
57 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
58 {
59   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
60 
61   PetscFunctionBegin;
62   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
63   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
64   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
65   if (aijkok->a_dual.need_sync_device()) {
66     aijkok->a_dual.sync_device();
67     aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */
68     aijkok->hermitian_updated = PETSC_FALSE;
69   }
70   PetscFunctionReturn(0);
71 }
72 
73 /* Mark the CSR data on device as modified */
74 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
75 {
76   PetscErrorCode   ierr;
77   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
78 
79   PetscFunctionBegin;
80   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
81   aijkok->a_dual.clear_sync_state();
82   aijkok->a_dual.modify_device();
83   aijkok->transpose_updated = PETSC_FALSE;
84   aijkok->hermitian_updated = PETSC_FALSE;
85   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
86   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
87   PetscFunctionReturn(0);
88 }
89 
90 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
91 {
92   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
93 
94   PetscFunctionBegin;
95   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
96    /* We do not expect one needs factors on host  */
97   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
98   if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
99   aijkok->a_dual.sync_host();
100   PetscFunctionReturn(0);
101 }
102 
103 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
104 {
105   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
106 
107   PetscFunctionBegin;
108   if (aijkok) {
109     aijkok->a_dual.sync_host();
110     *array = aijkok->a_dual.view_host().data();
111   } else { /* Happens when calling MatSetValues on a newly created matrix */
112     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
113   }
114   PetscFunctionReturn(0);
115 }
116 
117 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
118 {
119   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
120 
121   PetscFunctionBegin;
122   if (aijkok) aijkok->a_dual.modify_host();
123   PetscFunctionReturn(0);
124 }
125 
126 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
127 {
128   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
129 
130   PetscFunctionBegin;
131   if (aijkok) {
132     aijkok->a_dual.sync_host();
133     *array = aijkok->a_dual.view_host().data();
134   } else {
135     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
136   }
137   PetscFunctionReturn(0);
138 }
139 
140 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
141 {
142   PetscFunctionBegin;
143   *array = NULL;
144   PetscFunctionReturn(0);
145 }
146 
147 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
148 {
149   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
150 
151   PetscFunctionBegin;
152   if (aijkok) {
153     *array = aijkok->a_dual.view_host().data();
154   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
155     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
156   }
157   PetscFunctionReturn(0);
158 }
159 
160 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
161 {
162   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
163 
164   PetscFunctionBegin;
165   if (aijkok) {
166     aijkok->a_dual.clear_sync_state();
167     aijkok->a_dual.modify_host();
168   }
169   PetscFunctionReturn(0);
170 }
171 
172 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
173 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
174 {
175   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
176   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
177 
178   PetscFunctionBegin;
179   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
180   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
181   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
182   PetscFunctionReturn(0);
183 }
184 
185 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
186 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
187 {
188   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
189 
190   PetscFunctionBegin;
191   if (aijkok && aijkok->device_mat_d.data()) {
192     *d_mat = aijkok->device_mat_d.data();
193   } else {
194     PetscErrorCode   ierr;
195     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
196     *d_mat  = NULL;
197   }
198   PetscFunctionReturn(0);
199 }
200 
201 /* Generate the transpose on device and cache it internally */
202 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
203 {
204   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
205 
206   PetscFunctionBegin;
207   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
208   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
209     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
210     CHKERRCXX(aijkok->a_dual.sync_device());
211     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
212     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
213     aijkok->transpose_updated = PETSC_TRUE;
214   }
215   *csrmatT = &aijkok->csrmatT;
216   PetscFunctionReturn(0);
217 }
218 
219 /* Generate the Hermitian on device and cache it internally */
220 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
221 {
222   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
223 
224   PetscFunctionBegin;
225   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
226   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
227     CHKERRCXX(aijkok->a_dual.sync_device());
228     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
229     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
230    #if defined(PETSC_USE_COMPLEX)
231     const auto& a = aijkok->csrmatH.values;
232     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
233    #endif
234     aijkok->hermitian_updated = PETSC_TRUE;
235   }
236   *csrmatH = &aijkok->csrmatH;
237   PetscFunctionReturn(0);
238 }
239 
240 /* y = A x */
241 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
242 {
243   PetscErrorCode                   ierr;
244   Mat_SeqAIJKokkos                 *aijkok;
245   ConstPetscScalarKokkosView       xv;
246   PetscScalarKokkosView            yv;
247 
248   PetscFunctionBegin;
249   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
250   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
251   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
252   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
253   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
254   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
255   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
256   ierr   = WaitForKokkos();CHKERRQ(ierr);
257   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
258   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
259   PetscFunctionReturn(0);
260 }
261 
262 /* y = A^T x */
263 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
264 {
265   PetscErrorCode                   ierr;
266   Mat_SeqAIJKokkos                 *aijkok;
267   const char                       *mode;
268   ConstPetscScalarKokkosView       xv;
269   PetscScalarKokkosView            yv;
270   KokkosCsrMatrix                  *csrmat;
271 
272   PetscFunctionBegin;
273   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
274   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
275   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
276   if (A->form_explicit_transpose) {
277     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
278     mode = "N";
279   } else {
280     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
281     csrmat = &aijkok->csrmat;
282     mode = "T";
283   }
284   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
285   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
286   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
287   ierr = WaitForKokkos();CHKERRQ(ierr);
288   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
289   PetscFunctionReturn(0);
290 }
291 
292 /* y = A^H x */
293 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
294 {
295   PetscErrorCode                   ierr;
296   Mat_SeqAIJKokkos                 *aijkok;
297   const char                       *mode;
298   ConstPetscScalarKokkosView       xv;
299   PetscScalarKokkosView            yv;
300   KokkosCsrMatrix                  *csrmat;
301 
302   PetscFunctionBegin;
303   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
304   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
305   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
306   if (A->form_explicit_transpose) {
307     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
308     mode = "N";
309   } else {
310     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
311     csrmat = &aijkok->csrmat;
312     mode = "C";
313   }
314   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
315   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
316   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
317   ierr = WaitForKokkos();CHKERRQ(ierr);
318   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
319   PetscFunctionReturn(0);
320 }
321 
322 /* z = A x + y */
323 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
324 {
325   PetscErrorCode                   ierr;
326   Mat_SeqAIJKokkos                 *aijkok;
327   ConstPetscScalarKokkosView       xv,yv;
328   PetscScalarKokkosView            zv;
329 
330   PetscFunctionBegin;
331   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
332   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
333   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
334   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
335   if (zz != yy) Kokkos::deep_copy(zv,yv);
336   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
337   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
338   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
339   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
340   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
341   ierr = WaitForKokkos();CHKERRQ(ierr);
342   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
343   PetscFunctionReturn(0);
344 }
345 
346 /* z = A^T x + y */
347 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
348 {
349   PetscErrorCode                   ierr;
350   Mat_SeqAIJKokkos                 *aijkok;
351   const char                       *mode;
352   ConstPetscScalarKokkosView       xv,yv;
353   PetscScalarKokkosView            zv;
354   KokkosCsrMatrix                  *csrmat;
355 
356   PetscFunctionBegin;
357   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
358   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
359   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
360   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
361   if (zz != yy) Kokkos::deep_copy(zv,yv);
362   if (A->form_explicit_transpose) {
363     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
364     mode = "N";
365   } else {
366     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
367     csrmat = &aijkok->csrmat;
368     mode = "T";
369   }
370   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
371   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
372   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
373   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
374   ierr = WaitForKokkos();CHKERRQ(ierr);
375   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
376   PetscFunctionReturn(0);
377 }
378 
379 /* z = A^H x + y */
380 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
381 {
382   PetscErrorCode                   ierr;
383   Mat_SeqAIJKokkos                 *aijkok;
384   const char                       *mode;
385   ConstPetscScalarKokkosView       xv,yv;
386   PetscScalarKokkosView            zv;
387   KokkosCsrMatrix                  *csrmat;
388 
389   PetscFunctionBegin;
390   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
391   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
392   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
393   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
394   if (zz != yy) Kokkos::deep_copy(zv,yv);
395   if (A->form_explicit_transpose) {
396     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
397     mode = "N";
398   } else {
399     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
400     csrmat = &aijkok->csrmat;
401     mode = "C";
402   }
403   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
404   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
405   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
406   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
407   ierr = WaitForKokkos();CHKERRQ(ierr);
408   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
409   PetscFunctionReturn(0);
410 }
411 
412 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
413 {
414   PetscErrorCode            ierr;
415   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
416 
417   PetscFunctionBegin;
418   switch (op) {
419     case MAT_FORM_EXPLICIT_TRANSPOSE:
420       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
421       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
422       A->form_explicit_transpose = flg;
423       break;
424     default:
425       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
426       break;
427   }
428   PetscFunctionReturn(0);
429 }
430 
431 /* Depending on reuse, either build a new mat, or use the existing mat */
432 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
433 {
434   PetscErrorCode   ierr;
435   Mat_SeqAIJ       *aseq;
436 
437   PetscFunctionBegin;
438   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
439   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
440     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
441   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
442     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
443   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
444     if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
445     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
446     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
447     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
448     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
449     aseq = static_cast<Mat_SeqAIJ*>(A->data);
450     if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */
451       if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
452       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
453     }
454   }
455   PetscFunctionReturn(0);
456 }
457 
458 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
459    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
460  */
461 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
462 {
463   PetscErrorCode        ierr;
464   Mat_SeqAIJ            *bseq;
465   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
466   Mat                   mat;
467 
468   PetscFunctionBegin;
469   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
470   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
471   mat  = *B;
472   if (A->assembled) {
473     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
474     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
475     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
476     /* Now copy values to B if needed */
477     if (dupOption == MAT_COPY_VALUES) {
478       if (akok->a_dual.need_sync_device()) {
479         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
480         bkok->a_dual.modify_host();
481       } else { /* If device has the latest data, we only copy data on device */
482         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
483         bkok->a_dual.modify_device();
484       }
485     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
486       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
487       bkok->a_dual.modify_host();
488     }
489     mat->spptr = bkok;
490   }
491 
492   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
493   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
494   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
495   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
496   PetscFunctionReturn(0);
497 }
498 
499 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
500 {
501   PetscErrorCode    ierr;
502   Mat               At;
503   KokkosCsrMatrix   *internT,*csrmatT;
504   Mat_SeqAIJKokkos  *atkok,*bkok;
505 
506   PetscFunctionBegin;
507   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
508   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
509     CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */
510     CHKERRCXX(atkok   = new Mat_SeqAIJKokkos(*csrmatT));
511     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
512     if (reuse == MAT_INITIAL_MATRIX) *B = At;
513     else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
514   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
515     if ((*B)->assembled) {
516       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
517       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
518       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
519     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
520       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
521       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
522       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
523       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
524       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
525     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
526   }
527   PetscFunctionReturn(0);
528 }
529 
530 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
531 {
532   PetscErrorCode             ierr;
533   Mat_SeqAIJKokkos           *aijkok;
534 
535   PetscFunctionBegin;
536   if (A->factortype == MAT_FACTOR_NONE) {
537     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
538     if (aijkok) {
539       if (aijkok->device_mat_d.data()) {
540         delete aijkok->colmap_d;
541         delete aijkok->i_uncompressed_d;
542       }
543       if (aijkok->diag_d) delete aijkok->diag_d;
544     }
545     delete aijkok;
546   } else {
547     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
548   }
549   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
550   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
551   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       NULL);CHKERRQ(ierr);
552   A->spptr = NULL;
553   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
554   PetscFunctionReturn(0);
555 }
556 
557 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
558 {
559   PetscErrorCode ierr;
560 
561   PetscFunctionBegin;
562   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
563   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
564   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
565   PetscFunctionReturn(0);
566 }
567 
568 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
569 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
570 {
571   PetscErrorCode               ierr;
572   Mat_SeqAIJ                   *a,*b;
573   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
574   MatScalarKokkosView          aa,ba,ca;
575   MatRowMapKokkosView          ai,bi,ci;
576   MatColIdxKokkosView          aj,bj,cj;
577   PetscInt                     m,n,nnz,aN;
578 
579   PetscFunctionBegin;
580   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
581   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
582   PetscValidPointer(C,4);
583   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
584   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
585   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
586   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
587 
588   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
589   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
590   a    = static_cast<Mat_SeqAIJ*>(A->data);
591   b    = static_cast<Mat_SeqAIJ*>(B->data);
592   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
593   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
594   aa   = akok->a_dual.view_device();
595   ai   = akok->i_dual.view_device();
596   ba   = bkok->a_dual.view_device();
597   bi   = bkok->i_dual.view_device();
598   m    = A->rmap->n; /* M, N and nnz of C */
599   n    = A->cmap->n + B->cmap->n;
600   nnz  = a->nz + b->nz;
601   aN   = A->cmap->n; /* N of A */
602   if (reuse == MAT_INITIAL_MATRIX) {
603     aj = akok->j_dual.view_device();
604     bj = bkok->j_dual.view_device();
605     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
606     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
607     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
608     ca = ca_dual.view_device();
609     ci = ci_dual.view_device();
610     cj = cj_dual.view_device();
611 
612     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
613     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
614       PetscInt i = t.league_rank(); /* row i */
615       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
616 
617       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
618         ci(i) = coffset;
619         if (i == m-1) ci(m) = ai(m) + bi(m);
620       });
621 
622       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
623         if (k < alen) {
624           ca(coffset+k) = aa(ai(i)+k);
625           cj(coffset+k) = aj(ai(i)+k);
626         } else {
627           ca(coffset+k) = ba(bi(i)+k-alen);
628           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
629         }
630       });
631     });
632     ca_dual.modify_device();
633     ci_dual.modify_device();
634     cj_dual.modify_device();
635     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
636     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
637   } else if (reuse == MAT_REUSE_MATRIX) {
638     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
639     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
640     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
641     ca   = ckok->a_dual.view_device();
642     ci   = ckok->i_dual.view_device();
643 
644     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
645       PetscInt i = t.league_rank(); /* row i */
646       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
647       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
648         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
649         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
650       });
651     });
652     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
653   }
654   PetscFunctionReturn(0);
655 }
656 
657 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
658 {
659   PetscFunctionBegin;
660   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
661   PetscFunctionReturn(0);
662 }
663 
664 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
665 {
666   PetscErrorCode                 ierr;
667   Mat_Product                    *product = C->product;
668   Mat                            A,B;
669   bool                           transA,transB; /* use bool, since KK needs this type */
670   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
671   Mat_SeqAIJ                     *c;
672   MatProductData_SeqAIJKokkos    *pdata;
673   KokkosCsrMatrix                *csrmatA,*csrmatB;
674 
675   PetscFunctionBegin;
676   MatCheckProduct(C,1);
677   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
678   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
679 
680   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
681     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
682     PetscFunctionReturn(0);
683   }
684 
685   switch (product->type) {
686     case MATPRODUCT_AB:  transA = false; transB = false; break;
687     case MATPRODUCT_AtB: transA = true;  transB = false; break;
688     case MATPRODUCT_ABt: transA = false; transB = true;  break;
689     default:
690       SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
691   }
692 
693   A     = product->A;
694   B     = product->B;
695   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
696   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
697   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
698   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
699   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
700 
701   if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
702 
703   csrmatA = &akok->csrmat;
704   csrmatB = &bkok->csrmat;
705 
706   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
707   if (transA) {
708     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
709     transA = false;
710   }
711 
712   if (transB) {
713     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
714     transB = false;
715   }
716 
717   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
718   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
719   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
720   /* shorter version of MatAssemblyEnd_SeqAIJ */
721   c = (Mat_SeqAIJ*)C->data;
722   ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
723   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
724   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
725   c->reallocs         = 0;
726   C->info.mallocs     = 0;
727   C->info.nz_unneeded = 0;
728   C->assembled        = C->was_assembled = PETSC_TRUE;
729   C->num_ass++;
730   PetscFunctionReturn(0);
731 }
732 
733 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
734 {
735   PetscErrorCode                 ierr;
736   Mat_Product                    *product = C->product;
737   MatProductType                 ptype;
738   Mat                            A,B;
739   bool                           transA,transB;
740   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
741   MatProductData_SeqAIJKokkos    *pdata;
742   MPI_Comm                       comm;
743   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
744 
745   PetscFunctionBegin;
746   MatCheckProduct(C,1);
747   ierr = PetscObjectGetComm((PetscObject)C,&comm);
748   if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty");
749   A       = product->A;
750   B       = product->B;
751   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
752   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
753   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
754   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
755   csrmatA = &akok->csrmat;
756   csrmatB = &bkok->csrmat;
757 
758   ptype   = product->type;
759   switch (ptype) {
760     case MATPRODUCT_AB:  transA = false; transB = false; break;
761     case MATPRODUCT_AtB: transA = true;  transB = false; break;
762     case MATPRODUCT_ABt: transA = false; transB = true;  break;
763     default:
764       SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
765   }
766 
767   product->data = pdata = new MatProductData_SeqAIJKokkos();
768   pdata->kh.set_team_work_size(16);
769   pdata->kh.set_dynamic_scheduling(true);
770   pdata->reusesym = product->api_user;
771 
772   /* TODO: add command line options to select spgemm algorithms */
773   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
774   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
775      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
776    */
777   pdata->kh.create_spgemm_handle(spgemm_alg);
778 
779   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
780   if (transA) {
781     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
782     transA = false;
783   }
784 
785   if (transB) {
786     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
787     transB = false;
788   }
789 
790   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
791   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
792     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
793     calling new Mat_SeqAIJKokkos().
794     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
795   */
796   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
797   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
798 
799   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
800   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
801   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
802   PetscFunctionReturn(0);
803 }
804 
805 /* handles sparse matrix matrix ops */
806 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
807 {
808   PetscErrorCode ierr;
809   Mat_Product    *product = mat->product;
810   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
811 
812   PetscFunctionBegin;
813   MatCheckProduct(mat,1);
814   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
815   if (product->type == MATPRODUCT_ABC) {
816     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
817   }
818   if (Biskok && Ciskok) {
819     switch (product->type) {
820     case MATPRODUCT_AB:
821     case MATPRODUCT_AtB:
822     case MATPRODUCT_ABt:
823       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
824       break;
825     case MATPRODUCT_PtAP:
826     case MATPRODUCT_RARt:
827     case MATPRODUCT_ABC:
828       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
829       break;
830     default:
831       break;
832     }
833   } else { /* fallback for AIJ */
834     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
835   }
836   PetscFunctionReturn(0);
837 }
838 
839 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
840 {
841   PetscErrorCode   ierr;
842   Mat_SeqAIJKokkos *aijkok;
843 
844   PetscFunctionBegin;
845   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
846   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
847   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
848   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
849   ierr = WaitForKokkos();CHKERRQ(ierr);
850   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
851   PetscFunctionReturn(0);
852 }
853 
854 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
855 {
856   PetscErrorCode   ierr;
857   Mat_SeqAIJKokkos *aijkok;
858 
859   PetscFunctionBegin;
860   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
861   if (aijkok) { /* Only zero the device if data is already there */
862     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
863     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
864   } else { /* Might be preallocated but not assembled */
865     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
866   }
867   PetscFunctionReturn(0);
868 }
869 
870 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
871 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
872 {
873   PetscErrorCode     ierr;
874   Mat_SeqAIJKokkos   *aijkok;
875 
876   PetscFunctionBegin;
877   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
878   PetscValidPointer(kv,2);
879   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
880   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
881   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
882   *kv    = aijkok->a_dual.view_device();
883   PetscFunctionReturn(0);
884 }
885 
886 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
887 {
888   PetscFunctionBegin;
889   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
890   PetscValidPointer(kv,2);
891   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
892   PetscFunctionReturn(0);
893 }
894 
895 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
896 {
897   PetscErrorCode     ierr;
898   Mat_SeqAIJKokkos   *aijkok;
899 
900   PetscFunctionBegin;
901   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
902   PetscValidPointer(kv,2);
903   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
904   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
905   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
906   *kv    = aijkok->a_dual.view_device();
907   PetscFunctionReturn(0);
908 }
909 
910 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
911 {
912   PetscErrorCode     ierr;
913 
914   PetscFunctionBegin;
915   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
916   PetscValidPointer(kv,2);
917   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
918   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
919   PetscFunctionReturn(0);
920 }
921 
922 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
923 {
924   Mat_SeqAIJKokkos   *aijkok;
925 
926   PetscFunctionBegin;
927   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
928   PetscValidPointer(kv,2);
929   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
930   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
931   *kv    = aijkok->a_dual.view_device();
932   PetscFunctionReturn(0);
933 }
934 
935 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
936 {
937   PetscErrorCode     ierr;
938 
939   PetscFunctionBegin;
940   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
941   PetscValidPointer(kv,2);
942   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
943   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
944   PetscFunctionReturn(0);
945 }
946 
947 /* Computes Y += alpha X */
948 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
949 {
950   PetscErrorCode             ierr;
951   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
952   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
953   ConstMatScalarKokkosView   Xa;
954   MatScalarKokkosView        Ya;
955 
956   PetscFunctionBegin;
957   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
958   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
959   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
960   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
961 
962   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
963     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
964     PetscBool e;
965     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
966     if (e) {
967       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
968       if (e) pattern = SAME_NONZERO_PATTERN;
969     }
970   }
971 
972   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
973     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
974     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
975     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
976   */
977   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
978   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
979   Xa   = xkok->a_dual.view_device();
980   Ya   = ykok->a_dual.view_device();
981 
982   if (pattern == SAME_NONZERO_PATTERN) {
983     KokkosBlas::axpy(alpha,Xa,Ya);
984     ierr = MatSeqAIJKokkosModifyDevice(Y);
985   } else if (pattern == SUBSET_NONZERO_PATTERN) {
986     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
987     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
988 
989     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
990       PetscInt i = t.league_rank(); /* row i */
991       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
992         PetscInt p,q = Yi(i);
993         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
994           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
995           if (Xj(p) == Yj(q)) { /* Found it */
996             Ya(q) += alpha * Xa(p);
997             q++;
998           } else {
999             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1000                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1001             */
1002             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1003           }
1004         }
1005       });
1006     });
1007     ierr = MatSeqAIJKokkosModifyDevice(Y);
1008   } else { /* different nonzero patterns */
1009     Mat             Z;
1010     KokkosCsrMatrix zcsr;
1011     KernelHandle    kh;
1012     kh.create_spadd_handle(false);
1013     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1014     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1015     zkok = new Mat_SeqAIJKokkos(zcsr);
1016     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1017     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1018     kh.destroy_spadd_handle();
1019   }
1020 
1021   PetscFunctionReturn(0);
1022 }
1023 
1024 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
1025 {
1026   PetscErrorCode            ierr;
1027   MPI_Comm                  comm;
1028   PetscInt                  *i,*j,*perm,*jmap;
1029   PetscInt                  k,M,N,p,q,row,start,end,nnz,nneg;
1030   PetscBool                 has_repeats = PETSC_FALSE;
1031   PetscInt                  *Ai,*Aj;
1032   PetscScalar               *Aa;
1033   Mat_SeqAIJKokkos          *akok;
1034   Mat_SeqAIJ                *aseq;
1035   MatRowMapKokkosViewHost   perm_h("perm",n),jmap_h;
1036   Mat                       newmat;
1037 
1038   PetscFunctionBegin;
1039   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
1040   ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr);
1041   ierr = PetscMalloc2(n,&i,n,&j);CHKERRQ(ierr);
1042   ierr = PetscArraycpy(i,coo_i,n);CHKERRQ(ierr); /* Make a copy since we'll modify it */
1043   ierr = PetscArraycpy(j,coo_j,n);CHKERRQ(ierr);
1044   perm = perm_h.data();
1045   for (k=0; k<n; k++) { /* Ignore entries with negative row or col indices */
1046     if (j[k] < 0) i[k] = -1;
1047     perm[k] = k;
1048   }
1049 
1050   /* Sort by row */
1051   ierr = PetscSortIntWithArrayPair(n,i,j,perm);CHKERRQ(ierr);
1052   for (k=0; k<n; k++) {if (i[k] >= 0) break;} /* Advance k to the first row with a non-negative index */
1053   nneg = k;
1054   Kokkos::resize(jmap_h,n-nneg+1); /* Allocate an extra to make a CSR-like data structure. jmap[i] is the number of repeats for i-th nonzero */
1055   nnz  = 0; /* Total number of unique nonzeros to be counted */
1056   jmap = jmap_h.data(); /* In the end, only the first nnz+1 elements in jmap[] are significant. */
1057   jmap++; /* Inc jmap by 1 for convinience */
1058 
1059   ierr = PetscCalloc1(M+1,&Ai);CHKERRQ(ierr); /* CSR of A */
1060   ierr = PetscMalloc1(n-nneg,&Aj);CHKERRQ(ierr); /* We have at most n-k unique nonzeros */
1061 
1062   /* In each row, sort by column, then unique column indices to get row length */
1063   Ai++; /* Inc by 1 for convinience */
1064   q = 0; /* q-th unique nonzero, with q starting from 0 */
1065   while (k<n) {
1066     row   = i[k];
1067     start = k; /* [start,end) indices for this row */
1068     while (k<n && i[k] == row) k++;
1069     end   = k;
1070     ierr  = PetscSortIntWithArray(end-start,j+start,perm+start);CHKERRQ(ierr);
1071     /* Find number of unique col entries in this row */
1072     Aj[q]   = j[start]; /* Log the first nonzero in this row */
1073     jmap[q] = 1; /* Number of repeats of this nozero entry */
1074     Ai[row] = 1;
1075     nnz++;
1076 
1077     for (p=start+1; p<end; p++) { /* Scan remaining nonzero in this row */
1078       if (j[p] != j[p-1]) { /* Meet a new nonzero */
1079         q++;
1080         jmap[q] = 1;
1081         Aj[q]   = j[p];
1082         Ai[row]++;
1083         nnz++;
1084       } else {
1085         jmap[q]++;
1086         has_repeats = PETSC_TRUE;
1087       }
1088     }
1089     q++; /* Move to next row and thus next unique nonzero */
1090   }
1091   ierr = PetscFree2(i,j);CHKERRQ(ierr);
1092 
1093   Ai--; /* Back to the beginning of Ai[] */
1094   for (k=0; k<M; k++) Ai[k+1] += Ai[k];
1095   jmap--; /* Back to the beginning of jmap[] */
1096   jmap[0] = 0;
1097   if (has_repeats) { /* Only transform jmap[] to CSR when having repeats, otherwise jmap[] is not used */
1098     for (k=0; k<nnz; k++) jmap[k+1] += jmap[k];
1099     Kokkos::resize(jmap_h,nnz+1); /* Resize wrt the actual number of nonzeros */
1100   }
1101 
1102   if (nnz < n-nneg) { /* Realloc Aj[] to actual number of nonzeros */
1103     PetscInt *Aj_new;
1104     ierr = PetscMalloc1(nnz,&Aj_new);CHKERRQ(ierr);
1105     ierr = PetscArraycpy(Aj_new,Aj,nnz);CHKERRQ(ierr);
1106     ierr = PetscFree(Aj);CHKERRQ(ierr);
1107     Aj = Aj_new;
1108   }
1109 
1110   ierr = PetscMalloc1(nnz,&Aa);CHKERRQ(ierr);
1111   ierr = MatCreateSeqAIJWithArrays(comm,M,N,Ai,Aj,Aa,&newmat);CHKERRQ(ierr);
1112   aseq = static_cast<Mat_SeqAIJ*>(newmat->data);
1113   aseq->singlemalloc = PETSC_FALSE; /* Let newmat own Ai,Aj,Aa */
1114   aseq->free_a       = aseq->free_ij = PETSC_TRUE;
1115 
1116   ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
1117   ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
1118   ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
1119 
1120   if (nneg) { /* Discard heading entries with negative indices in perm_h, as we'll access it from index 0 in MatSetValuesCOO */
1121     MatRowMapKokkosViewHost   newperm_h("perm",n-nneg);
1122     Kokkos::deep_copy(newperm_h,Kokkos::subview(perm_h,Kokkos::make_pair(nneg,n)));
1123     perm_h = newperm_h;
1124   }
1125 
1126   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1127   akok->SetUpCOO(n,has_repeats,jmap_h,perm_h);
1128   PetscFunctionReturn(0);
1129 }
1130 
1131 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1132 {
1133   PetscErrorCode              ierr;
1134   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1135   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1136   PetscInt                    nz = aseq->nz;
1137   const MatRowMapKokkosView&  jmap = akok->jmap_d;
1138   const MatRowMapKokkosView&  perm = akok->perm_d;
1139   MatScalarKokkosView         Aa;
1140   ConstMatScalarKokkosView    kv;
1141   PetscMemType                memtype;
1142 
1143   PetscFunctionBegin;
1144   if (!v) { /* NULL v means an all zero array */
1145     ierr = MatZeroEntries(A);CHKERRQ(ierr);
1146     PetscFunctionReturn(0);
1147   }
1148 
1149   ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
1150   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1151     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,akok->coo_n));
1152   } else {
1153     kv = ConstMatScalarKokkosView(v,akok->coo_n); /* Directly use v[]'s memory */
1154   }
1155 
1156   ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr); /* Might read and write matrix values */
1157   if (imode == INSERT_VALUES) {
1158     Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
1159   }
1160 
1161   if (akok->coo_has_repeats) {
1162     Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) {
1163       for (PetscInt k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));
1164     });
1165   } else {
1166     Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) {Aa(i) += kv(perm(i));});
1167   }
1168   ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);
1169   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1170   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1171   PetscFunctionReturn(0);
1172 }
1173 
1174 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1175 {
1176   PetscErrorCode ierr;
1177 
1178   PetscFunctionBegin;
1179   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
1180   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
1181   B->offloadmask = PETSC_OFFLOAD_CPU;
1182   PetscFunctionReturn(0);
1183 }
1184 
1185 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1186 {
1187   PetscErrorCode     ierr;
1188   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1189 
1190   PetscFunctionBegin;
1191   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1192   A->boundtocpu  = PETSC_FALSE;
1193 
1194   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1195   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1196   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1197   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1198   A->ops->scale                     = MatScale_SeqAIJKokkos;
1199   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1200   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1201   A->ops->mult                      = MatMult_SeqAIJKokkos;
1202   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1203   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1204   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1205   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1206   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1207   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1208   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1209   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1210   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1211   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1212   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1213   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1214   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1215   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1216 
1217   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr);
1218   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr);
1219   PetscFunctionReturn(0);
1220 }
1221 
1222 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1223 {
1224   PetscErrorCode ierr;
1225   Mat_SeqAIJ     *aseq;
1226   PetscInt       i,m,n;
1227 
1228   PetscFunctionBegin;
1229   if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1230 
1231   m    = akok->nrows();
1232   n    = akok->ncols();
1233   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1234   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1235 
1236   /* Set up data structures of A as a MATSEQAIJ */
1237   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1238   aseq = (Mat_SeqAIJ*)(A)->data;
1239 
1240   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1241   akok->j_dual.sync_host();
1242 
1243   aseq->i            = akok->i_host_data();
1244   aseq->j            = akok->j_host_data();
1245   aseq->a            = akok->a_host_data();
1246   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1247   aseq->singlemalloc = PETSC_FALSE;
1248   aseq->free_a       = PETSC_FALSE;
1249   aseq->free_ij      = PETSC_FALSE;
1250   aseq->nz           = akok->nnz();
1251   aseq->maxnz        = aseq->nz;
1252 
1253   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1254   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1255   for (i=0; i<m; i++) {
1256     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1257   }
1258 
1259   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1260   akok->nonzerostate = A->nonzerostate;
1261   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1262   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1263   A->spptr = akok;
1264   PetscFunctionReturn(0);
1265 }
1266 
1267 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1268 
1269    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1270  */
1271 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1272 {
1273   PetscErrorCode ierr;
1274 
1275   PetscFunctionBegin;
1276   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1277   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
1278   PetscFunctionReturn(0);
1279 }
1280 
1281 /* --------------------------------------------------------------------------------*/
1282 /*@C
1283    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1284    (the default parallel PETSc format). This matrix will ultimately be handled by
1285    Kokkos for calculations. For good matrix
1286    assembly performance the user should preallocate the matrix storage by setting
1287    the parameter nz (or the array nnz).  By setting these parameters accurately,
1288    performance during matrix assembly can be increased by more than a factor of 50.
1289 
1290    Collective
1291 
1292    Input Parameters:
1293 +  comm - MPI communicator, set to PETSC_COMM_SELF
1294 .  m - number of rows
1295 .  n - number of columns
1296 .  nz - number of nonzeros per row (same for all rows)
1297 -  nnz - array containing the number of nonzeros in the various rows
1298          (possibly different for each row) or NULL
1299 
1300    Output Parameter:
1301 .  A - the matrix
1302 
1303    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1304    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1305    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1306 
1307    Notes:
1308    If nnz is given then nz is ignored
1309 
1310    The AIJ format (also called the Yale sparse matrix format or
1311    compressed row storage), is fully compatible with standard Fortran 77
1312    storage.  That is, the stored row and column indices can begin at
1313    either one (as in Fortran) or zero.  See the users' manual for details.
1314 
1315    Specify the preallocated storage with either nz or nnz (not both).
1316    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1317    allocation.  For large problems you MUST preallocate memory or you
1318    will get TERRIBLE performance, see the users' manual chapter on matrices.
1319 
1320    By default, this format uses inodes (identical nodes) when possible, to
1321    improve numerical efficiency of matrix-vector products and solves. We
1322    search for consecutive rows with the same nonzero structure, thereby
1323    reusing matrix information to achieve increased efficiency.
1324 
1325    Level: intermediate
1326 
1327 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1328 @*/
1329 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1330 {
1331   PetscErrorCode ierr;
1332 
1333   PetscFunctionBegin;
1334   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1335   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1336   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1337   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1338   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1339   PetscFunctionReturn(0);
1340 }
1341 
1342 typedef Kokkos::TeamPolicy<>::member_type team_member;
1343 //
1344 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
1345 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1346 //
1347 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1348 {
1349   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1350   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1351   IS                 isrow = b->row,isicol = b->icol;
1352   PetscErrorCode     ierr;
1353   const PetscInt     *r_h,*ic_h;
1354   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data();
1355   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1356   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1357   PetscBool          row_identity,col_identity;
1358   PetscInt           nc, Nf, nVec=32; // should be a parameter
1359   PetscContainer     container;
1360 
1361   PetscFunctionBegin;
1362   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1363   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
1364   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1365   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
1366   if (container) {
1367     PetscInt *pNf=NULL, nv;
1368     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
1369     Nf = (*pNf)%1000;
1370     nv = (*pNf)/1000;
1371     if (nv>0) nVec = nv;
1372   } else Nf = 1;
1373   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %" PetscInt_FMT " %" PetscInt_FMT,n,Nf);
1374   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
1375   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
1376   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
1377 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1378   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1379 #endif
1380   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1381   {
1382 #define KOKKOS_SHARED_LEVEL 1
1383     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1384     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1385     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1386     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1387     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1388     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1389     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1390     size_t flops_h = 0.0;
1391     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1392     Kokkos::View<size_t> d_flops_k ("flops");
1393     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1394     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1395     Kokkos::deep_copy (d_flops_k, h_flops_k);
1396     Kokkos::deep_copy (d_r_k, h_r_k);
1397     Kokkos::deep_copy (d_ic_k, h_ic_k);
1398     // Fill A --> fact
1399     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1400         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1401         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1402         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1403         // zero rows of B
1404         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1405             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1406             PetscScalar *baL = ba_d + bi_d[rowb];
1407             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1408             /* zero (unfactored row) */
1409             for (int j=0;j<nzbL;j++) baL[j] = 0;
1410             for (int j=0;j<nzbU;j++) baU[j] = 0;
1411           });
1412         // copy A into B
1413         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1414             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1415             const PetscScalar *av    = aa_d + ai_d[rowa];
1416             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1417             /* load in initial (unfactored row) */
1418             for (int j=0;j<nza;j++) {
1419               PetscInt    colb = ic[ajtmp[j]];
1420               PetscScalar vala = av[j];
1421               if (colb == rowb) {
1422                 *(ba_d + bdiag_d[rowb]) = vala;
1423               } else {
1424                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1425                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1426                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1427                 for (int j=0; j<nz ; j++) {
1428                   if (pbj[j] == colb) {
1429                     pba[j] = vala;
1430                     set++;
1431                     break;
1432                   }
1433                 }
1434                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1435               }
1436             }
1437           });
1438       });
1439     Kokkos::fence();
1440 
1441     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1442         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1443         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1444         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1445         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1446         const PetscInt  start = field*nloc, end = start + nloc;
1447         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1448         // A22 panel update for each row A(1,:) and col A(:,1)
1449         for (int ii=start; ii<end-1; ii++) {
1450           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1451           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1452           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1453           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1454           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1455               PetscInt kIdx = j*Ni + field_block_idx;
1456               if (kIdx >= nzUi) /* void */ ;
1457               else {
1458                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1459                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1460                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1461                 size_t         st_idx;
1462                 // find and do L(k,i) = A(:k,i) / A(i,i)
1463                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1464                 // get column, there has got to be a better way
1465                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1466                     if (pjL[j] == ii) {
1467                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1468                       idx = j; // output
1469                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1470                     }
1471                 }, st_idx);
1472                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1473 #if defined(PETSC_USE_DEBUG)
1474                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1475 #endif
1476                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1477                 // U(i+1,:end)
1478                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1479                       PetscScalar Uij = baUi[uiIdx];
1480                       PetscInt    col = bjUi[uiIdx];
1481                       if (col==myk) {
1482                         // A_kk = A_kk - L_ki * U_ij(k)
1483                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1484                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1485                       } else {
1486                         PetscScalar    *start, *end, *pAkjv=NULL;
1487                         PetscInt       high, low;
1488                         const PetscInt *startj;
1489                         if (col<myk) { // L
1490                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1491                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1492                           start = pLki+1; // start at pLki+1, A22(myk,1)
1493                           startj= bj_d + bi_d[myk] + idx;
1494                           end   = ba_d + bi_d[myk+1];
1495                         } else {
1496                           PetscInt idx = bdiag_d[myk+1]+1;
1497                           start = ba_d + idx;
1498                           startj= bj_d + idx;
1499                           end   = ba_d + bdiag_d[myk];
1500                         }
1501                         // search for 'col', use bisection search - TODO
1502                         low  = 0;
1503                         high = (PetscInt)(end-start);
1504                         while (high-low > 5) {
1505                           int t = (low+high)/2;
1506                           if (startj[t] > col) high = t;
1507                           else                 low  = t;
1508                         }
1509                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1510                           if (startj[pAkjv-start] == col) break;
1511                         }
1512 #if defined(PETSC_USE_DEBUG)
1513                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1514 #endif
1515                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1516                       }
1517                     });
1518               }
1519             });
1520           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1521           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1522         } /* endof for (i=0; i<n; i++) { */
1523         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1524       });
1525     Kokkos::fence();
1526     Kokkos::deep_copy (h_flops_k, d_flops_k);
1527 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1528     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1529 #elif  defined(PETSC_USE_LOG)
1530     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1531 #endif
1532     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1533         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1534         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1535         /* Invert diagonal for simpler triangular solves */
1536         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1537             int i = start + outer_index*Ni + lg_rank%Ni;
1538             if (i < end) {
1539               PetscScalar *pv = ba_d + bdiag_d[i];
1540               *pv = 1.0/(*pv);
1541             }
1542           });
1543       });
1544   }
1545 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1546   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1547 #endif
1548   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
1549   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
1550 
1551   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1552   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
1553   if (b->inode.size) {
1554     B->ops->solve = MatSolve_SeqAIJ_Inode;
1555   } else if (row_identity && col_identity) {
1556     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1557   } else {
1558     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1559   }
1560   B->offloadmask = PETSC_OFFLOAD_GPU;
1561   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
1562   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1563   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1564   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1565   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1566   B->assembled              = PETSC_TRUE;
1567   B->preallocated           = PETSC_TRUE;
1568 
1569   PetscFunctionReturn(0);
1570 }
1571 
1572 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1573 {
1574   PetscErrorCode   ierr;
1575 
1576   PetscFunctionBegin;
1577   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1578   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1579   PetscFunctionReturn(0);
1580 }
1581 
1582 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1583 {
1584   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1585 
1586   PetscFunctionBegin;
1587   if (!factors->sptrsv_symbolic_completed) {
1588     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1589     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1590     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1591   }
1592   PetscFunctionReturn(0);
1593 }
1594 
1595 /* Check if we need to update factors etc for transpose solve */
1596 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1597 {
1598   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1599   MatColIdxType             n = A->rmap->n;
1600 
1601   PetscFunctionBegin;
1602   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1603     /* Update L^T and do sptrsv symbolic */
1604     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1605     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1606     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1607     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1608 
1609     KokkosKernels::Impl::transpose_matrix<
1610       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1611       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1612       MatRowMapKokkosView,DefaultExecutionSpace>(
1613         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1614         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1615 
1616     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1617       We have to sort the indices, until KK provides finer control options.
1618     */
1619     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1620       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1621         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1622 
1623     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1624 
1625     /* Update U^T and do sptrsv symbolic */
1626     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1627     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1628     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1629     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1630 
1631     KokkosKernels::Impl::transpose_matrix<
1632       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1633       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1634       MatRowMapKokkosView,DefaultExecutionSpace>(
1635         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1636         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1637 
1638     /* Sort indices. See comments above */
1639     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1640       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1641         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1642 
1643     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1644     factors->transpose_updated = PETSC_TRUE;
1645   }
1646   PetscFunctionReturn(0);
1647 }
1648 
1649 /* Solve Ax = b, with A = LU */
1650 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1651 {
1652   PetscErrorCode                 ierr;
1653   ConstPetscScalarKokkosView     bv;
1654   PetscScalarKokkosView          xv;
1655   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1656 
1657   PetscFunctionBegin;
1658   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
1659   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1660   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1661   /* Solve L tmpv = b */
1662   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1663   /* Solve Ux = tmpv */
1664   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1665   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1666   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1667   PetscFunctionReturn(0);
1668 }
1669 
1670 /* Solve A^T x = b, where A^T = U^T L^T */
1671 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1672 {
1673   PetscErrorCode                 ierr;
1674   ConstPetscScalarKokkosView     bv;
1675   PetscScalarKokkosView          xv;
1676   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1677 
1678   PetscFunctionBegin;
1679   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
1680   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1681   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1682   /* Solve U^T tmpv = b */
1683   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1684 
1685   /* Solve L^T x = tmpv */
1686   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1687   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1688   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1689   PetscFunctionReturn(0);
1690 }
1691 
1692 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1693 {
1694   PetscErrorCode                 ierr;
1695   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1696   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1697   PetscInt                       fill_lev = info->levels;
1698 
1699   PetscFunctionBegin;
1700   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1701 
1702   auto a_d = aijkok->a_dual.view_device();
1703   auto i_d = aijkok->i_dual.view_device();
1704   auto j_d = aijkok->j_dual.view_device();
1705 
1706   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1707 
1708   B->assembled                       = PETSC_TRUE;
1709   B->preallocated                    = PETSC_TRUE;
1710   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1711   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1712   B->ops->matsolve                   = NULL;
1713   B->ops->matsolvetranspose          = NULL;
1714   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1715 
1716   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1717   factors->transpose_updated         = PETSC_FALSE;
1718   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1719   PetscFunctionReturn(0);
1720 }
1721 
1722 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1723 {
1724   PetscErrorCode                 ierr;
1725   Mat_SeqAIJKokkos               *aijkok;
1726   Mat_SeqAIJ                     *b;
1727   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1728   PetscInt                       fill_lev = info->levels;
1729   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1730   PetscInt                       n = A->rmap->n;
1731 
1732   PetscFunctionBegin;
1733   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1734   /* Rebuild factors */
1735   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1736   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1737 
1738   /* Create a spiluk handle and then do symbolic factorization */
1739   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1740   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1741 
1742   auto spiluk_handle = factors->kh.get_spiluk_handle();
1743 
1744   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1745   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1746   Kokkos::realloc(factors->iU_d,n+1);
1747   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1748 
1749   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1750   auto i_d = aijkok->i_dual.view_device();
1751   auto j_d = aijkok->j_dual.view_device();
1752   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1753   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1754 
1755   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1756   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1757   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1758   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1759 
1760   /* TODO: add options to select sptrsv algorithms */
1761   /* Create sptrsv handles for L, U and their transpose */
1762  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1763   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1764  #else
1765   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1766  #endif
1767 
1768   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1769   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1770   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1771   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1772 
1773   /* Fill fields of the factor matrix B */
1774   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1775   b     = (Mat_SeqAIJ*)B->data;
1776   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1777   B->info.fill_ratio_given  = info->fill;
1778   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1779 
1780   B->offloadmask            = PETSC_OFFLOAD_GPU;
1781   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1782   PetscFunctionReturn(0);
1783 }
1784 
1785 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1786 {
1787   PetscErrorCode   ierr;
1788   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1789   const PetscInt   nrows   = A->rmap->n;
1790 
1791   PetscFunctionBegin;
1792   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1793   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1794   // move B data into Kokkos
1795   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
1796   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
1797   {
1798     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1799     if (!baijkok->diag_d) {
1800       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1801       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1802       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
1803     }
1804   }
1805   PetscFunctionReturn(0);
1806 }
1807 
1808 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1809 {
1810   PetscFunctionBegin;
1811   *type = MATSOLVERKOKKOS;
1812   PetscFunctionReturn(0);
1813 }
1814 
1815 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1816 {
1817   PetscFunctionBegin;
1818   *type = MATSOLVERKOKKOSDEVICE;
1819   PetscFunctionReturn(0);
1820 }
1821 
1822 /*MC
1823   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1824   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1825 
1826   Level: beginner
1827 
1828 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1829 M*/
1830 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1831 {
1832   PetscErrorCode ierr;
1833   PetscInt       n = A->rmap->n;
1834 
1835   PetscFunctionBegin;
1836   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1837   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1838   (*B)->factortype = ftype;
1839   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1840   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1841 
1842   if (ftype == MAT_FACTOR_LU) {
1843     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1844     (*B)->canuseordering         = PETSC_TRUE;
1845     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1846   } else if (ftype == MAT_FACTOR_ILU) {
1847     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1848     (*B)->canuseordering         = PETSC_FALSE;
1849     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1850   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1851 
1852   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1853   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1854   PetscFunctionReturn(0);
1855 }
1856 
1857 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1858 {
1859   PetscErrorCode ierr;
1860   PetscInt       n = A->rmap->n;
1861 
1862   PetscFunctionBegin;
1863   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1864   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1865   (*B)->factortype = ftype;
1866   (*B)->canuseordering = PETSC_TRUE;
1867   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1868   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1869 
1870   if (ftype == MAT_FACTOR_LU) {
1871     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1872     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1873   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1874 
1875   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1876   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1877   PetscFunctionReturn(0);
1878 }
1879 
1880 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1881 {
1882   PetscErrorCode ierr;
1883 
1884   PetscFunctionBegin;
1885   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1886   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1887   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1888   PetscFunctionReturn(0);
1889 }
1890 
1891 /* Utility to print out a KokkosCsrMatrix for debugging */
1892 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1893 {
1894   PetscErrorCode    ierr;
1895   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1896   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1897   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1898   const PetscInt    *i = iv.data();
1899   const PetscInt    *j = jv.data();
1900   const PetscScalar *a = av.data();
1901   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1902 
1903   PetscFunctionBegin;
1904   ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1905   for (PetscInt k=0; k<m; k++) {
1906     ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr);
1907     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1908       ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1909     }
1910     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1911   }
1912   PetscFunctionReturn(0);
1913 }
1914