xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 076ba34a670039b3127ea98935d34fd539c66ade)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petscsystypes.h>
5 #include <petscerror.h>
6 
7 #include <Kokkos_Core.hpp>
8 #include <KokkosBlas.hpp>
9 #include <KokkosKernels_Sorting.hpp>
10 #include <KokkosSparse_CrsMatrix.hpp>
11 #include <KokkosSparse_spmv.hpp>
12 #include <KokkosSparse_spiluk.hpp>
13 #include <KokkosSparse_sptrsv.hpp>
14 #include <KokkosSparse_spgemm.hpp>
15 #include <KokkosSparse_spadd.hpp>
16 
17 #include <../src/mat/impls/aij/seq/aij.h>
18 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   PetscErrorCode    ierr;
29   Mat_SeqAIJ        *aijseq;
30   Mat_SeqAIJKokkos  *aijkok;
31 
32   PetscFunctionBegin;
33   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
34   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
35 
36   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38 
39   /* If aijkok does not exist, we just copy i, j to device.
40      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
41      In both cases, we build a new aijkok structure.
42   */
43   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
44     delete aijkok;
45     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
46     A->spptr = aijkok;
47   }
48 
49   if (aijkok && aijkok->device_mat_d.data()) {
50     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
51   }
52   PetscFunctionReturn(0);
53 }
54 
55 /* Sync CSR data to device if not yet */
56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
57 {
58   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
59 
60   PetscFunctionBegin;
61   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
62   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
63   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
64   if (aijkok->a_dual.need_sync_device()) {
65     aijkok->a_dual.sync_device();
66     aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */
67     aijkok->hermitian_updated = PETSC_FALSE;
68   }
69   PetscFunctionReturn(0);
70 }
71 
72 /* Mark the CSR data on device as modified */
73 static PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
74 {
75   PetscErrorCode   ierr;
76   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
77 
78   PetscFunctionBegin;
79   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
80   aijkok->a_dual.clear_sync_state();
81   aijkok->a_dual.modify_device();
82   aijkok->transpose_updated = PETSC_FALSE;
83   aijkok->hermitian_updated = PETSC_FALSE;
84   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
85   PetscFunctionReturn(0);
86 }
87 
88 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
89 {
90   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
91 
92   PetscFunctionBegin;
93   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
94    /* We do not expect one needs factors on host  */
95   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
96   if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
97   aijkok->a_dual.sync_host();
98   PetscFunctionReturn(0);
99 }
100 
101 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
102 {
103   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
104 
105   PetscFunctionBegin;
106   if (aijkok) {
107     aijkok->a_dual.sync_host();
108     *array = aijkok->a_dual.view_host().data();
109   } else { /* Happens when calling MatSetValues on a newly created matrix */
110     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
111   }
112   PetscFunctionReturn(0);
113 }
114 
115 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
116 {
117   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
118 
119   PetscFunctionBegin;
120   if (aijkok) aijkok->a_dual.modify_host();
121   PetscFunctionReturn(0);
122 }
123 
124 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
125 {
126   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
127 
128   PetscFunctionBegin;
129   aijkok->a_dual.sync_host();
130   *array = aijkok->a_dual.view_host().data();
131   PetscFunctionReturn(0);
132 }
133 
134 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
135 {
136   PetscFunctionBegin;
137   *array = NULL;
138   PetscFunctionReturn(0);
139 }
140 
141 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
142 {
143   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
144 
145   PetscFunctionBegin;
146   *array = aijkok->a_dual.view_host().data();
147   PetscFunctionReturn(0);
148 }
149 
150 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
151 {
152   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
153 
154   PetscFunctionBegin;
155   aijkok->a_dual.clear_sync_state();
156   aijkok->a_dual.modify_host();
157   PetscFunctionReturn(0);
158 }
159 
160 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
161 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
162 {
163   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
164   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
165 
166   PetscFunctionBegin;
167   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
168   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
169   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
170   PetscFunctionReturn(0);
171 }
172 
173 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
174 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
175 {
176   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
177 
178   PetscFunctionBegin;
179   if (aijkok && aijkok->device_mat_d.data()) {
180     *d_mat = aijkok->device_mat_d.data();
181   } else {
182     PetscErrorCode   ierr;
183     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
184     *d_mat  = NULL;
185   }
186   PetscFunctionReturn(0);
187 }
188 
189 /* Generate the transpose on device and cache it internally */
190 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
191 {
192   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
193 
194   PetscFunctionBegin;
195   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
196   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
197     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
198     CHKERRCXX(aijkok->a_dual.sync_device());
199     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
200     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
201     aijkok->transpose_updated = PETSC_TRUE;
202   }
203   *csrmatT = &aijkok->csrmatT;
204   PetscFunctionReturn(0);
205 }
206 
207 /* Generate the Hermitian on device and cache it internally */
208 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
209 {
210   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
211 
212   PetscFunctionBegin;
213   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
214   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
215     CHKERRCXX(aijkok->a_dual.sync_device());
216     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
217     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
218    #if defined(PETSC_USE_COMPLEX)
219     const auto& a = aijkok->csrmatH.values;
220     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
221    #endif
222     aijkok->hermitian_updated = PETSC_TRUE;
223   }
224   *csrmatH = &aijkok->csrmatH;
225   PetscFunctionReturn(0);
226 }
227 
228 /* y = A x */
229 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
230 {
231   PetscErrorCode                   ierr;
232   Mat_SeqAIJKokkos                 *aijkok;
233   ConstPetscScalarKokkosView       xv;
234   PetscScalarKokkosView            yv;
235 
236   PetscFunctionBegin;
237   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
238   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
239   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
240   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
241   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
242   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
243   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
244   ierr   = WaitForKokkos();CHKERRQ(ierr);
245   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
246   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
247   PetscFunctionReturn(0);
248 }
249 
250 /* y = A^T x */
251 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
252 {
253   PetscErrorCode                   ierr;
254   Mat_SeqAIJKokkos                 *aijkok;
255   const char                       *mode;
256   ConstPetscScalarKokkosView       xv;
257   PetscScalarKokkosView            yv;
258   KokkosCsrMatrix                  *csrmat;
259 
260   PetscFunctionBegin;
261   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
262   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
263   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
264   if (A->form_explicit_transpose) {
265     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
266     mode = "N";
267   } else {
268     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
269     csrmat = &aijkok->csrmat;
270     mode = "T";
271   }
272   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
273   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
274   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
275   ierr = WaitForKokkos();CHKERRQ(ierr);
276   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
277   PetscFunctionReturn(0);
278 }
279 
280 /* y = A^H x */
281 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
282 {
283   PetscErrorCode                   ierr;
284   Mat_SeqAIJKokkos                 *aijkok;
285   const char                       *mode;
286   ConstPetscScalarKokkosView       xv;
287   PetscScalarKokkosView            yv;
288   KokkosCsrMatrix                  *csrmat;
289 
290   PetscFunctionBegin;
291   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
292   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
293   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
294   if (A->form_explicit_transpose) {
295     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
296     mode = "N";
297   } else {
298     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
299     csrmat = &aijkok->csrmat;
300     mode = "C";
301   }
302   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
303   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
304   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
305   ierr = WaitForKokkos();CHKERRQ(ierr);
306   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
307   PetscFunctionReturn(0);
308 }
309 
310 /* z = A x + y */
311 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
312 {
313   PetscErrorCode                   ierr;
314   Mat_SeqAIJKokkos                 *aijkok;
315   ConstPetscScalarKokkosView       xv,yv;
316   PetscScalarKokkosView            zv;
317 
318   PetscFunctionBegin;
319   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
320   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
321   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
322   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
323   if (zz != yy) Kokkos::deep_copy(zv,yv);
324   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
325   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
326   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
327   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
328   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
329   ierr = WaitForKokkos();CHKERRQ(ierr);
330   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
331   PetscFunctionReturn(0);
332 }
333 
334 /* z = A^T x + y */
335 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
336 {
337   PetscErrorCode                   ierr;
338   Mat_SeqAIJKokkos                 *aijkok;
339   const char                       *mode;
340   ConstPetscScalarKokkosView       xv,yv;
341   PetscScalarKokkosView            zv;
342   KokkosCsrMatrix                  *csrmat;
343 
344   PetscFunctionBegin;
345   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
346   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
347   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
348   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
349   if (zz != yy) Kokkos::deep_copy(zv,yv);
350   if (A->form_explicit_transpose) {
351     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
352     mode = "N";
353   } else {
354     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
355     csrmat = &aijkok->csrmat;
356     mode = "T";
357   }
358   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
359   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
360   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
361   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
362   ierr = WaitForKokkos();CHKERRQ(ierr);
363   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
364   PetscFunctionReturn(0);
365 }
366 
367 /* z = A^H x + y */
368 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
369 {
370   PetscErrorCode                   ierr;
371   Mat_SeqAIJKokkos                 *aijkok;
372   const char                       *mode;
373   ConstPetscScalarKokkosView       xv,yv;
374   PetscScalarKokkosView            zv;
375   KokkosCsrMatrix                  *csrmat;
376 
377   PetscFunctionBegin;
378   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
379   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
380   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
381   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
382   if (zz != yy) Kokkos::deep_copy(zv,yv);
383   if (A->form_explicit_transpose) {
384     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
385     mode = "N";
386   } else {
387     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
388     csrmat = &aijkok->csrmat;
389     mode = "C";
390   }
391   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
392   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
393   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
394   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
395   ierr = WaitForKokkos();CHKERRQ(ierr);
396   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
397   PetscFunctionReturn(0);
398 }
399 
400 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
401 {
402   PetscErrorCode            ierr;
403   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
404 
405   PetscFunctionBegin;
406   switch (op) {
407     case MAT_FORM_EXPLICIT_TRANSPOSE:
408       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
409       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
410       A->form_explicit_transpose = flg;
411       break;
412     default:
413       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
414       break;
415   }
416   PetscFunctionReturn(0);
417 }
418 
419 /* Depending on reuse, either build a new mat, or use the existing mat */
420 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
421 {
422   PetscErrorCode   ierr;
423   Mat_SeqAIJ       *aseq;
424 
425   PetscFunctionBegin;
426   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
427   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
428     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
429   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
430     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
431   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
432     if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
433     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
434     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
435     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
436     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
437     aseq = static_cast<Mat_SeqAIJ*>(A->data);
438     if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */
439       if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
440       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
441     }
442   }
443   PetscFunctionReturn(0);
444 }
445 
446 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
447    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
448  */
449 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
450 {
451   PetscErrorCode        ierr;
452   Mat_SeqAIJ            *bseq;
453   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
454   Mat                   mat;
455 
456   PetscFunctionBegin;
457   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
458   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
459   mat  = *B;
460   if (A->assembled) {
461     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
462     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
463     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
464     /* Now copy values to B if needed */
465     if (dupOption == MAT_COPY_VALUES) {
466       if (akok->a_dual.need_sync_device()) {
467         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
468         bkok->a_dual.modify_host();
469       } else { /* If device has the latest data, we only copy data on device */
470         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
471         bkok->a_dual.modify_device();
472       }
473     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
474       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
475       bkok->a_dual.modify_host();
476     }
477     mat->spptr = bkok;
478   }
479 
480   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
481   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
482   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
483   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
484   PetscFunctionReturn(0);
485 }
486 
487 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
488 {
489   PetscErrorCode             ierr;
490   Mat_SeqAIJKokkos           *aijkok;
491 
492   PetscFunctionBegin;
493   if (A->factortype == MAT_FACTOR_NONE) {
494     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
495     if (aijkok) {
496       if (aijkok->device_mat_d.data()) {
497         delete aijkok->colmap_d;
498         delete aijkok->i_uncompressed_d;
499       }
500       if (aijkok->diag_d) delete aijkok->diag_d;
501     }
502     delete aijkok;
503   } else {
504     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
505   }
506   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
507   A->spptr = NULL;
508   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
509   PetscFunctionReturn(0);
510 }
511 
512 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
513 {
514   PetscErrorCode ierr;
515 
516   PetscFunctionBegin;
517   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
518   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
519   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
520   PetscFunctionReturn(0);
521 }
522 
523 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
524 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
525 {
526   PetscErrorCode               ierr;
527   Mat_SeqAIJ                   *a,*b;
528   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
529   MatScalarKokkosView          aa,ba,ca;
530   MatRowMapKokkosView          ai,bi,ci;
531   MatColIdxKokkosView          aj,bj,cj;
532   PetscInt                     m,n,nnz,aN;
533 
534   PetscFunctionBegin;
535   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
536   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
537   PetscValidPointer(C,4);
538   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
539   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
540   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
541   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
542 
543   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
544   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
545   a    = static_cast<Mat_SeqAIJ*>(A->data);
546   b    = static_cast<Mat_SeqAIJ*>(B->data);
547   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
548   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
549   aa   = akok->a_dual.view_device();
550   ai   = akok->i_dual.view_device();
551   ba   = bkok->a_dual.view_device();
552   bi   = bkok->i_dual.view_device();
553   m    = A->rmap->n; /* M, N and nnz of C */
554   n    = A->cmap->n + B->cmap->n;
555   nnz  = a->nz + b->nz;
556   aN   = A->cmap->n; /* N of A */
557   if (reuse == MAT_INITIAL_MATRIX) {
558     aj = akok->j_dual.view_device();
559     bj = bkok->j_dual.view_device();
560     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
561     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
562     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
563     ca = ca_dual.view_device();
564     ci = ci_dual.view_device();
565     cj = cj_dual.view_device();
566 
567     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
568     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
569       PetscInt i = t.league_rank(); /* row i */
570       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
571 
572       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
573         ci(i) = coffset;
574         if (i == m-1) ci(m) = ai(m) + bi(m);
575       });
576 
577       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
578         if (k < alen) {
579           ca(coffset+k) = aa(ai(i)+k);
580           cj(coffset+k) = aj(ai(i)+k);
581         } else {
582           ca(coffset+k) = ba(bi(i)+k-alen);
583           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
584         }
585       });
586     });
587     ca_dual.modify_device();
588     ci_dual.modify_device();
589     cj_dual.modify_device();
590     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
591     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
592   } else if (reuse == MAT_REUSE_MATRIX) {
593     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
594     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
595     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
596     ca   = ckok->a_dual.view_device();
597     ci   = ckok->i_dual.view_device();
598 
599     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
600       PetscInt i = t.league_rank(); /* row i */
601       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
602       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
603         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
604         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
605       });
606     });
607     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
608   }
609   PetscFunctionReturn(0);
610 }
611 
612 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
613 {
614   PetscFunctionBegin;
615   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
616   PetscFunctionReturn(0);
617 }
618 
619 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
620 {
621   PetscErrorCode                 ierr;
622   Mat_Product                    *product = C->product;
623   Mat                            A,B;
624   bool                           transA,transB; /* use bool, since KK needs this type */
625   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
626   Mat_SeqAIJ                     *c;
627   MatProductData_SeqAIJKokkos    *pdata;
628   KokkosCsrMatrix                *csrmatA,*csrmatB;
629 
630   PetscFunctionBegin;
631   MatCheckProduct(C,1);
632   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
633   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
634 
635   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
636     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
637     PetscFunctionReturn(0);
638   }
639 
640   switch (product->type) {
641     case MATPRODUCT_AB:  transA = false; transB = false; break;
642     case MATPRODUCT_AtB: transA = true;  transB = false; break;
643     case MATPRODUCT_ABt: transA = false; transB = true;  break;
644     default:
645       SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
646   }
647 
648   A     = product->A;
649   B     = product->B;
650   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
651   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
652   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
653   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
654   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
655 
656   if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
657 
658   csrmatA = &akok->csrmat;
659   csrmatB = &bkok->csrmat;
660 
661   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
662   if (transA) {
663     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
664     transA = false;
665   }
666 
667   if (transB) {
668     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
669     transB = false;
670   }
671 
672   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
673   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
674   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
675   /* shorter version of MatAssemblyEnd_SeqAIJ */
676   c = (Mat_SeqAIJ*)C->data;
677   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
678   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
679   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
680   c->reallocs         = 0;
681   C->info.mallocs     = 0;
682   C->info.nz_unneeded = 0;
683   C->assembled        = C->was_assembled = PETSC_TRUE;
684   C->num_ass++;
685   PetscFunctionReturn(0);
686 }
687 
688 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
689 {
690   PetscErrorCode                 ierr;
691   Mat_Product                    *product = C->product;
692   MatProductType                 ptype;
693   Mat                            A,B;
694   bool                           transA,transB;
695   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
696   MatProductData_SeqAIJKokkos    *pdata;
697   MPI_Comm                       comm;
698   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
699 
700   PetscFunctionBegin;
701   MatCheckProduct(C,1);
702   ierr = PetscObjectGetComm((PetscObject)C,&comm);
703   if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty");
704   A       = product->A;
705   B       = product->B;
706   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
707   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
708   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
709   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
710   csrmatA = &akok->csrmat;
711   csrmatB = &bkok->csrmat;
712 
713   ptype   = product->type;
714   switch (ptype) {
715     case MATPRODUCT_AB:  transA = false; transB = false; break;
716     case MATPRODUCT_AtB: transA = true;  transB = false; break;
717     case MATPRODUCT_ABt: transA = false; transB = true;  break;
718     default:
719       SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
720   }
721 
722   product->data = pdata = new MatProductData_SeqAIJKokkos();
723   pdata->kh.set_team_work_size(16);
724   pdata->kh.set_dynamic_scheduling(true);
725   pdata->reusesym = product->api_user;
726 
727   /* TODO: add command line options to select spgemm algorithms */
728   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
729 #if defined(PETSC_HAVE_CUDA)
730   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
731     /* This algorithm + cuda-10.2 sometimes gave wrong results (invalid device pointers in csrmatC) and failed snes/tutorials/ex56.c */
732     spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_CUSPARSE;
733   #endif
734 #endif
735   pdata->kh.create_spgemm_handle(spgemm_alg);
736 
737   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
738   if (transA) {
739     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
740     transA = false;
741   }
742 
743   if (transB) {
744     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
745     transB = false;
746   }
747 
748   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
749   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
750     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
751     calling new Mat_SeqAIJKokkos().
752     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
753   */
754   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
755   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
756 
757   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
758   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
759   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
760   PetscFunctionReturn(0);
761 }
762 
763 /* handles sparse matrix matrix ops */
764 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
765 {
766   PetscErrorCode ierr;
767   Mat_Product    *product = mat->product;
768   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
769 
770   PetscFunctionBegin;
771   MatCheckProduct(mat,1);
772   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
773   if (product->type == MATPRODUCT_ABC) {
774     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
775   }
776   if (Biskok && Ciskok) {
777     switch (product->type) {
778     case MATPRODUCT_AB:
779     case MATPRODUCT_AtB:
780     case MATPRODUCT_ABt:
781       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
782       break;
783     case MATPRODUCT_PtAP:
784     case MATPRODUCT_RARt:
785     case MATPRODUCT_ABC:
786       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
787       break;
788     default:
789       break;
790     }
791   } else { /* fallback for AIJ */
792     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
793   }
794   PetscFunctionReturn(0);
795 }
796 
797 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
798 {
799   PetscErrorCode   ierr;
800   Mat_SeqAIJKokkos *aijkok;
801 
802   PetscFunctionBegin;
803   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
804   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
805   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
806   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
807   ierr = WaitForKokkos();CHKERRQ(ierr);
808   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
809   PetscFunctionReturn(0);
810 }
811 
812 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
813 {
814   PetscErrorCode   ierr;
815   Mat_SeqAIJKokkos *aijkok;
816 
817   PetscFunctionBegin;
818   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
819   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
820   KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
821   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); /* Cached diagonal values are invalided */
822   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
823   PetscFunctionReturn(0);
824 }
825 
826 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
827 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
828 {
829   PetscErrorCode     ierr;
830   Mat_SeqAIJKokkos   *aijkok;
831 
832   PetscFunctionBegin;
833   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
834   PetscValidPointer(kv,2);
835   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
836   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
837   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
838   *kv    = aijkok->a_dual.view_device();
839   PetscFunctionReturn(0);
840 }
841 
842 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
843 {
844   PetscFunctionBegin;
845   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
846   PetscValidPointer(kv,2);
847   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
848   PetscFunctionReturn(0);
849 }
850 
851 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv)
852 {
853   PetscErrorCode     ierr;
854   Mat_SeqAIJKokkos   *aijkok;
855 
856   PetscFunctionBegin;
857   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
858   PetscValidPointer(kv,2);
859   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
860   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
861   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
862   *kv    = aijkok->a_dual.view_device();
863   PetscFunctionReturn(0);
864 }
865 
866 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv)
867 {
868   PetscErrorCode     ierr;
869 
870   PetscFunctionBegin;
871   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
872   PetscValidPointer(kv,2);
873   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
874   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
875   PetscFunctionReturn(0);
876 }
877 
878 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
879 {
880   Mat_SeqAIJKokkos   *aijkok;
881 
882   PetscFunctionBegin;
883   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
884   PetscValidPointer(kv,2);
885   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
886   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
887   *kv    = aijkok->a_dual.view_device();
888   PetscFunctionReturn(0);
889 }
890 
891 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
892 {
893   PetscErrorCode     ierr;
894 
895   PetscFunctionBegin;
896   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
897   PetscValidPointer(kv,2);
898   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
899   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
900   PetscFunctionReturn(0);
901 }
902 
903 /* Computes Y += a*X */
904 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str)
905 {
906   PetscErrorCode ierr;
907   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
908 
909   PetscFunctionBegin;
910   if (X->ops->axpy != Y->ops->axpy) {
911     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
912     PetscFunctionReturn(0);
913   }
914 
915   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) {
916     PetscBool e;
917     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
918     if (e) {
919       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
920       if (e) str = SAME_NONZERO_PATTERN;
921     }
922   }
923 
924   if (str != SAME_NONZERO_PATTERN) {
925     /* TODO: do MatAXPY on device */
926     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
927     PetscFunctionReturn(0);
928   } else {
929     ConstPetscScalarKokkosView xv;
930     PetscScalarKokkosView      yv;
931 
932     ierr = MatSeqAIJGetKokkosView(X,&xv);CHKERRQ(ierr);
933     ierr = MatSeqAIJGetKokkosView(Y,&yv);CHKERRQ(ierr);
934     KokkosBlas::axpy(a,xv,yv);
935     ierr = MatSeqAIJRestoreKokkosView(X,&xv);CHKERRQ(ierr);
936     ierr = MatSeqAIJRestoreKokkosView(Y,&yv);CHKERRQ(ierr);
937   }
938   PetscFunctionReturn(0);
939 }
940 
941 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
942 {
943   PetscErrorCode ierr;
944 
945   PetscFunctionBegin;
946   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
947   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
948   B->offloadmask = PETSC_OFFLOAD_CPU;
949   PetscFunctionReturn(0);
950 }
951 
952 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
953 {
954   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
955 
956   PetscFunctionBegin;
957   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
958   A->boundtocpu  = PETSC_FALSE;
959 
960   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
961   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
962   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
963   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
964   A->ops->scale                     = MatScale_SeqAIJKokkos;
965   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
966   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
967   A->ops->mult                      = MatMult_SeqAIJKokkos;
968   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
969   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
970   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
971   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
972   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
973   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
974   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
975   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
976   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
977   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
978   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
979   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
980   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
981   PetscFunctionReturn(0);
982 }
983 
984 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
985 {
986   PetscErrorCode     ierr;
987   Mat_SeqAIJ         *aseq;
988   PetscInt           i,m,n;
989 
990   PetscFunctionBegin;
991   if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
992 
993   m    = akok->nrows();
994   n    = akok->ncols();
995   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
996   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
997 
998   /* Set up data structures of A as a MATSEQAIJ */
999   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1000   aseq = (Mat_SeqAIJ*)(A)->data;
1001 
1002   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1003   akok->j_dual.sync_host();
1004 
1005   aseq->i            = akok->i_host_data();
1006   aseq->j            = akok->j_host_data();
1007   aseq->a            = akok->a_host_data();
1008   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1009   aseq->singlemalloc = PETSC_FALSE;
1010   aseq->free_a       = PETSC_FALSE;
1011   aseq->free_ij      = PETSC_FALSE;
1012   aseq->nz           = akok->nnz();
1013   aseq->maxnz        = aseq->nz;
1014 
1015   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1016   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1017   for (i=0; i<m; i++) {
1018     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1019   }
1020 
1021   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1022   akok->nonzerostate = A->nonzerostate;
1023   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1024   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1025   A->spptr = akok;
1026   PetscFunctionReturn(0);
1027 }
1028 
1029 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1030 
1031    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1032  */
1033 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1034 {
1035   PetscErrorCode     ierr;
1036 
1037   PetscFunctionBegin;
1038   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1039   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
1040   PetscFunctionReturn(0);
1041 }
1042 
1043 /* --------------------------------------------------------------------------------*/
1044 /*@C
1045    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1046    (the default parallel PETSc format). This matrix will ultimately be handled by
1047    Kokkos for calculations. For good matrix
1048    assembly performance the user should preallocate the matrix storage by setting
1049    the parameter nz (or the array nnz).  By setting these parameters accurately,
1050    performance during matrix assembly can be increased by more than a factor of 50.
1051 
1052    Collective
1053 
1054    Input Parameters:
1055 +  comm - MPI communicator, set to PETSC_COMM_SELF
1056 .  m - number of rows
1057 .  n - number of columns
1058 .  nz - number of nonzeros per row (same for all rows)
1059 -  nnz - array containing the number of nonzeros in the various rows
1060          (possibly different for each row) or NULL
1061 
1062    Output Parameter:
1063 .  A - the matrix
1064 
1065    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1066    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1067    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1068 
1069    Notes:
1070    If nnz is given then nz is ignored
1071 
1072    The AIJ format (also called the Yale sparse matrix format or
1073    compressed row storage), is fully compatible with standard Fortran 77
1074    storage.  That is, the stored row and column indices can begin at
1075    either one (as in Fortran) or zero.  See the users' manual for details.
1076 
1077    Specify the preallocated storage with either nz or nnz (not both).
1078    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1079    allocation.  For large problems you MUST preallocate memory or you
1080    will get TERRIBLE performance, see the users' manual chapter on matrices.
1081 
1082    By default, this format uses inodes (identical nodes) when possible, to
1083    improve numerical efficiency of matrix-vector products and solves. We
1084    search for consecutive rows with the same nonzero structure, thereby
1085    reusing matrix information to achieve increased efficiency.
1086 
1087    Level: intermediate
1088 
1089 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1090 @*/
1091 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1092 {
1093   PetscErrorCode ierr;
1094 
1095   PetscFunctionBegin;
1096   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1097   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1098   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1099   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1100   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1101   PetscFunctionReturn(0);
1102 }
1103 
1104 typedef Kokkos::TeamPolicy<>::member_type team_member;
1105 //
1106 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
1107 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1108 //
1109 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1110 {
1111   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1112   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1113   IS                 isrow = b->row,isicol = b->icol;
1114   PetscErrorCode     ierr;
1115   const PetscInt     *r_h,*ic_h;
1116   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data();
1117   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1118   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1119   PetscBool          row_identity,col_identity;
1120   PetscInt           nc, Nf, nVec=32; // should be a parameter
1121   PetscContainer     container;
1122 
1123   PetscFunctionBegin;
1124   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n);
1125   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
1126   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1127   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
1128   if (container) {
1129     PetscInt *pNf=NULL, nv;
1130     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
1131     Nf = (*pNf)%1000;
1132     nv = (*pNf)/1000;
1133     if (nv>0) nVec = nv;
1134   } else Nf = 1;
1135   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
1136   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
1137   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
1138   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
1139 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1140   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1141 #endif
1142   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1143   {
1144 #define KOKKOS_SHARED_LEVEL 1
1145     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1146     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1147     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1148     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1149     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1150     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1151     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1152     size_t flops_h = 0.0;
1153     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1154     Kokkos::View<size_t> d_flops_k ("flops");
1155     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1156     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1157     Kokkos::deep_copy (d_flops_k, h_flops_k);
1158     Kokkos::deep_copy (d_r_k, h_r_k);
1159     Kokkos::deep_copy (d_ic_k, h_ic_k);
1160     // Fill A --> fact
1161     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1162         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1163         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1164         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1165         // zero rows of B
1166         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1167             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1168             PetscScalar *baL = ba_d + bi_d[rowb];
1169             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1170             /* zero (unfactored row) */
1171             for (int j=0;j<nzbL;j++) baL[j] = 0;
1172             for (int j=0;j<nzbU;j++) baU[j] = 0;
1173           });
1174         // copy A into B
1175         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1176             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1177             const PetscScalar *av    = aa_d + ai_d[rowa];
1178             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1179             /* load in initial (unfactored row) */
1180             for (int j=0;j<nza;j++) {
1181               PetscInt    colb = ic[ajtmp[j]];
1182               PetscScalar vala = av[j];
1183               if (colb == rowb) {
1184                 *(ba_d + bdiag_d[rowb]) = vala;
1185               } else {
1186                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1187                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1188                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1189                 for (int j=0; j<nz ; j++) {
1190                   if (pbj[j] == colb) {
1191                     pba[j] = vala;
1192                     set++;
1193                     break;
1194                   }
1195                 }
1196                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1197               }
1198             }
1199           });
1200       });
1201     Kokkos::fence();
1202 
1203     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1204         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1205         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1206         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1207         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1208         const PetscInt  start = field*nloc, end = start + nloc;
1209         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1210         // A22 panel update for each row A(1,:) and col A(:,1)
1211         for (int ii=start; ii<end-1; ii++) {
1212           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1213           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1214           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1215           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1216           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1217               PetscInt kIdx = j*Ni + field_block_idx;
1218               if (kIdx >= nzUi) /* void */ ;
1219               else {
1220                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1221                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1222                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1223                 size_t         st_idx;
1224                 // find and do L(k,i) = A(:k,i) / A(i,i)
1225                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1226                 // get column, there has got to be a better way
1227                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1228                     //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii);
1229                     if (pjL[j] == ii) {
1230                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1231                       idx = j; // output
1232                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1233                     }
1234                   }, st_idx);
1235                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1236                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii);
1237                 else { // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1238                   // U(i+1,:end)
1239                   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1240                       PetscScalar Uij = baUi[uiIdx];
1241                       PetscInt    col = bjUi[uiIdx];
1242                       if (col==myk) {
1243                         // A_kk = A_kk - L_ki * U_ij(k)
1244                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1245                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1246                       } else {
1247                         PetscScalar    *start, *end, *pAkjv=NULL;
1248                         PetscInt       high, low;
1249                         const PetscInt *startj;
1250                         if (col<myk) { // L
1251                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1252                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1253                           start = pLki+1; // start at pLki+1, A22(myk,1)
1254                           startj= bj_d + bi_d[myk] + idx;
1255                           end   = ba_d + bi_d[myk+1];
1256                         } else {
1257                           PetscInt idx = bdiag_d[myk+1]+1;
1258                           start = ba_d + idx;
1259                           startj= bj_d + idx;
1260                           end   = ba_d + bdiag_d[myk];
1261                         }
1262                         // search for 'col', use bisection search - TODO
1263                         low  = 0;
1264                         high = (PetscInt)(end-start);
1265                         while (high-low > 5) {
1266                           int t = (low+high)/2;
1267                           if (startj[t] > col) high = t;
1268                           else                 low  = t;
1269                         }
1270                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1271                           if (startj[pAkjv-start] == col) break;
1272                         }
1273                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col);
1274                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1275                       }
1276                     });
1277                 }
1278               }
1279             });
1280           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1281           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1282         } /* endof for (i=0; i<n; i++) { */
1283         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1284       });
1285     Kokkos::fence();
1286     Kokkos::deep_copy (h_flops_k, d_flops_k);
1287 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1288     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1289 #elif  defined(PETSC_USE_LOG)
1290     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1291 #endif
1292     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1293         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1294         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1295         /* Invert diagonal for simpler triangular solves */
1296         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1297             int i = start + outer_index*Ni + lg_rank%Ni;
1298             if (i < end) {
1299               PetscScalar *pv = ba_d + bdiag_d[i];
1300               *pv = 1.0/(*pv);
1301             }
1302           });
1303       });
1304   }
1305 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1306   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1307 #endif
1308   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
1309   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
1310 
1311   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1312   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
1313   if (b->inode.size) {
1314     B->ops->solve = MatSolve_SeqAIJ_Inode;
1315   } else if (row_identity && col_identity) {
1316     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1317   } else {
1318     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1319   }
1320   B->offloadmask = PETSC_OFFLOAD_GPU;
1321   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
1322   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1323   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1324   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1325   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1326   B->assembled              = PETSC_TRUE;
1327   B->preallocated           = PETSC_TRUE;
1328 
1329   PetscFunctionReturn(0);
1330 }
1331 
1332 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1333 {
1334   PetscErrorCode   ierr;
1335 
1336   PetscFunctionBegin;
1337   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1338   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1339   PetscFunctionReturn(0);
1340 }
1341 
1342 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1343 {
1344   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1345 
1346   PetscFunctionBegin;
1347   if (!factors->sptrsv_symbolic_completed) {
1348     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1349     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1350     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1351   }
1352   PetscFunctionReturn(0);
1353 }
1354 
1355 /* Check if we need to update factors etc for transpose solve */
1356 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1357 {
1358   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1359   MatColIdxType             n = A->rmap->n;
1360 
1361   PetscFunctionBegin;
1362   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1363     /* Update L^T and do sptrsv symbolic */
1364     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1365     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1366     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1367     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1368 
1369     KokkosKernels::Impl::transpose_matrix<
1370       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1371       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1372       MatRowMapKokkosView,DefaultExecutionSpace>(
1373         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1374         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1375 
1376     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1377       We have to sort the indices, until KK provides finer control options.
1378     */
1379     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1380       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1381         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1382 
1383     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1384 
1385     /* Update U^T and do sptrsv symbolic */
1386     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1387     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1388     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1389     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1390 
1391     KokkosKernels::Impl::transpose_matrix<
1392       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1393       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1394       MatRowMapKokkosView,DefaultExecutionSpace>(
1395         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1396         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1397 
1398     /* Sort indices. See comments above */
1399     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1400       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1401         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1402 
1403     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1404     factors->transpose_updated = PETSC_TRUE;
1405   }
1406   PetscFunctionReturn(0);
1407 }
1408 
1409 /* Solve Ax = b, with A = LU */
1410 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1411 {
1412   PetscErrorCode                 ierr;
1413   ConstPetscScalarKokkosView     bv;
1414   PetscScalarKokkosView          xv;
1415   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1416 
1417   PetscFunctionBegin;
1418   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
1419   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1420   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1421   /* Solve L tmpv = b */
1422   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1423   /* Solve Ux = tmpv */
1424   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1425   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1426   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1427   PetscFunctionReturn(0);
1428 }
1429 
1430 /* Solve A^T x = b, where A^T = U^T L^T */
1431 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1432 {
1433   PetscErrorCode                 ierr;
1434   ConstPetscScalarKokkosView     bv;
1435   PetscScalarKokkosView          xv;
1436   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1437 
1438   PetscFunctionBegin;
1439   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
1440   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1441   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1442   /* Solve U^T tmpv = b */
1443   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1444 
1445   /* Solve L^T x = tmpv */
1446   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1447   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1448   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1449   PetscFunctionReturn(0);
1450 }
1451 
1452 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1453 {
1454   PetscErrorCode                 ierr;
1455   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1456   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1457   PetscInt                       fill_lev = info->levels;
1458 
1459   PetscFunctionBegin;
1460   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1461 
1462   auto a_d = aijkok->a_dual.view_device();
1463   auto i_d = aijkok->i_dual.view_device();
1464   auto j_d = aijkok->j_dual.view_device();
1465 
1466   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1467 
1468   B->assembled                       = PETSC_TRUE;
1469   B->preallocated                    = PETSC_TRUE;
1470   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1471   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1472   B->ops->matsolve                   = NULL;
1473   B->ops->matsolvetranspose          = NULL;
1474   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1475 
1476   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1477   factors->transpose_updated         = PETSC_FALSE;
1478   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1479   PetscFunctionReturn(0);
1480 }
1481 
1482 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1483 {
1484   PetscErrorCode                 ierr;
1485   Mat_SeqAIJKokkos               *aijkok;
1486   Mat_SeqAIJ                     *b;
1487   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1488   PetscInt                       fill_lev = info->levels;
1489   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1490   PetscInt                       n = A->rmap->n;
1491 
1492   PetscFunctionBegin;
1493   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1494   /* Rebuild factors */
1495   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1496   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1497 
1498   /* Create a spiluk handle and then do symbolic factorization */
1499   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1500   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1501 
1502   auto spiluk_handle = factors->kh.get_spiluk_handle();
1503 
1504   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1505   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1506   Kokkos::realloc(factors->iU_d,n+1);
1507   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1508 
1509   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1510   auto i_d = aijkok->i_dual.view_device();
1511   auto j_d = aijkok->j_dual.view_device();
1512   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1513   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1514 
1515   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1516   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1517   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1518   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1519 
1520   /* TODO: add options to select sptrsv algorithms */
1521   /* Create sptrsv handles for L, U and their transpose */
1522  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1523   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1524  #else
1525   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1526  #endif
1527 
1528   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1529   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1530   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1531   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1532 
1533   /* Fill fields of the factor matrix B */
1534   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1535   b     = (Mat_SeqAIJ*)B->data;
1536   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1537   B->info.fill_ratio_given  = info->fill;
1538   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1539 
1540   B->offloadmask            = PETSC_OFFLOAD_GPU;
1541   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1542   PetscFunctionReturn(0);
1543 }
1544 
1545 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1546 {
1547   PetscErrorCode   ierr;
1548   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1549   const PetscInt   nrows   = A->rmap->n;
1550 
1551   PetscFunctionBegin;
1552   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1553   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1554   // move B data into Kokkos
1555   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
1556   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
1557   {
1558     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1559     if (!baijkok->diag_d) {
1560       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1561       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1562       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
1563     }
1564   }
1565   PetscFunctionReturn(0);
1566 }
1567 
1568 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1569 {
1570   PetscFunctionBegin;
1571   *type = MATSOLVERKOKKOS;
1572   PetscFunctionReturn(0);
1573 }
1574 
1575 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1576 {
1577   PetscFunctionBegin;
1578   *type = MATSOLVERKOKKOSDEVICE;
1579   PetscFunctionReturn(0);
1580 }
1581 
1582 /*MC
1583   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1584   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1585 
1586   Level: beginner
1587 
1588 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1589 M*/
1590 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1591 {
1592   PetscErrorCode ierr;
1593   PetscInt       n = A->rmap->n;
1594 
1595   PetscFunctionBegin;
1596   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1597   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1598   (*B)->factortype = ftype;
1599   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1600   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1601 
1602   if (ftype == MAT_FACTOR_LU) {
1603     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1604     (*B)->canuseordering         = PETSC_TRUE;
1605     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1606   } else if (ftype == MAT_FACTOR_ILU) {
1607     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1608     (*B)->canuseordering         = PETSC_FALSE;
1609     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1610   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1611 
1612   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1613   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1614   PetscFunctionReturn(0);
1615 }
1616 
1617 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1618 {
1619   PetscErrorCode ierr;
1620   PetscInt       n = A->rmap->n;
1621 
1622   PetscFunctionBegin;
1623   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1624   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1625   (*B)->factortype = ftype;
1626   (*B)->canuseordering = PETSC_TRUE;
1627   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1628   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1629 
1630   if (ftype == MAT_FACTOR_LU) {
1631     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1632     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1633   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1634 
1635   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1636   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1637   PetscFunctionReturn(0);
1638 }
1639 
1640 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1641 {
1642   PetscErrorCode ierr;
1643 
1644   PetscFunctionBegin;
1645   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1646   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1647   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1648   PetscFunctionReturn(0);
1649 }
1650 
1651 /* Utility to print out a KokkosCsrMatrix for debugging */
1652 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1653 {
1654   PetscErrorCode    ierr;
1655   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1656   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1657   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1658   const PetscInt    *i = iv.data();
1659   const PetscInt    *j = jv.data();
1660   const PetscScalar *a = av.data();
1661   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1662 
1663   PetscFunctionBegin;
1664   ierr = PetscPrintf(PETSC_COMM_SELF,"%D x %D SeqAIJKokkos, with %D nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1665   for (PetscInt k=0; k<m; k++) {
1666     ierr = PetscPrintf(PETSC_COMM_SELF,"%D: ",k);CHKERRQ(ierr);
1667     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1668       ierr = PetscPrintf(PETSC_COMM_SELF,"%D(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1669     }
1670     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1671   }
1672   PetscFunctionReturn(0);
1673 }
1674