xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision c17cf69914952e266e86d7403471480e6d7133d3)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petscsystypes.h>
5 #include <petscerror.h>
6 
7 #include <Kokkos_Core.hpp>
8 #include <KokkosBlas.hpp>
9 #include <KokkosKernels_Sorting.hpp>
10 #include <KokkosSparse_CrsMatrix.hpp>
11 #include <KokkosSparse_spmv.hpp>
12 #include <KokkosSparse_spiluk.hpp>
13 #include <KokkosSparse_sptrsv.hpp>
14 #include <KokkosSparse_spgemm.hpp>
15 #include <KokkosSparse_spadd.hpp>
16 
17 #include <../src/mat/impls/aij/seq/aij.h>
18 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   PetscErrorCode    ierr;
29   Mat_SeqAIJ        *aijseq;
30   Mat_SeqAIJKokkos  *aijkok;
31 
32   PetscFunctionBegin;
33   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
34   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
35 
36   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38 
39   /* If aijkok does not exist, we just copy i, j to device.
40      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
41      In both cases, we build a new aijkok structure.
42   */
43   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
44     delete aijkok;
45     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
46     A->spptr = aijkok;
47   }
48 
49   if (aijkok && aijkok->device_mat_d.data()) {
50     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
51   }
52   PetscFunctionReturn(0);
53 }
54 
55 /* Sync CSR data to device if not yet */
56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
57 {
58   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
59 
60   PetscFunctionBegin;
61   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
62   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
63   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
64   if (aijkok->a_dual.need_sync_device()) {
65     aijkok->a_dual.sync_device();
66     aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */
67     aijkok->hermitian_updated = PETSC_FALSE;
68   }
69   PetscFunctionReturn(0);
70 }
71 
72 /* Mark the CSR data on device as modified */
73 static PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
74 {
75   PetscErrorCode   ierr;
76   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
77 
78   PetscFunctionBegin;
79   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
80   aijkok->a_dual.clear_sync_state();
81   aijkok->a_dual.modify_device();
82   aijkok->transpose_updated = PETSC_FALSE;
83   aijkok->hermitian_updated = PETSC_FALSE;
84   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
85   PetscFunctionReturn(0);
86 }
87 
88 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
89 {
90   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
91 
92   PetscFunctionBegin;
93   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
94    /* We do not expect one needs factors on host  */
95   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
96   if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
97   aijkok->a_dual.sync_host();
98   PetscFunctionReturn(0);
99 }
100 
101 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
102 {
103   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
104 
105   PetscFunctionBegin;
106   if (aijkok) {
107     aijkok->a_dual.sync_host();
108     *array = aijkok->a_dual.view_host().data();
109   } else { /* Happens when calling MatSetValues on a newly created matrix */
110     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
111   }
112   PetscFunctionReturn(0);
113 }
114 
115 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
116 {
117   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
118 
119   PetscFunctionBegin;
120   if (aijkok) aijkok->a_dual.modify_host();
121   PetscFunctionReturn(0);
122 }
123 
124 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
125 {
126   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
127 
128   PetscFunctionBegin;
129   aijkok->a_dual.sync_host();
130   *array = aijkok->a_dual.view_host().data();
131   PetscFunctionReturn(0);
132 }
133 
134 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
135 {
136   PetscFunctionBegin;
137   *array = NULL;
138   PetscFunctionReturn(0);
139 }
140 
141 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
142 {
143   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
144 
145   PetscFunctionBegin;
146   *array = aijkok->a_dual.view_host().data();
147   PetscFunctionReturn(0);
148 }
149 
150 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
151 {
152   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
153 
154   PetscFunctionBegin;
155   aijkok->a_dual.clear_sync_state();
156   aijkok->a_dual.modify_host();
157   PetscFunctionReturn(0);
158 }
159 
160 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
161 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
162 {
163   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
164   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
165 
166   PetscFunctionBegin;
167   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
168   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
169   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
170   PetscFunctionReturn(0);
171 }
172 
173 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
174 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
175 {
176   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
177 
178   PetscFunctionBegin;
179   if (aijkok && aijkok->device_mat_d.data()) {
180     *d_mat = aijkok->device_mat_d.data();
181   } else {
182     PetscErrorCode   ierr;
183     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
184     *d_mat  = NULL;
185   }
186   PetscFunctionReturn(0);
187 }
188 
189 /* Generate the transpose on device and cache it internally */
190 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
191 {
192   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
193 
194   PetscFunctionBegin;
195   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
196   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
197     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
198     CHKERRCXX(aijkok->a_dual.sync_device());
199     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
200     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
201     aijkok->transpose_updated = PETSC_TRUE;
202   }
203   *csrmatT = &aijkok->csrmatT;
204   PetscFunctionReturn(0);
205 }
206 
207 /* Generate the Hermitian on device and cache it internally */
208 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
209 {
210   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
211 
212   PetscFunctionBegin;
213   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
214   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
215     CHKERRCXX(aijkok->a_dual.sync_device());
216     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
217     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
218    #if defined(PETSC_USE_COMPLEX)
219     const auto& a = aijkok->csrmatH.values;
220     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
221    #endif
222     aijkok->hermitian_updated = PETSC_TRUE;
223   }
224   *csrmatH = &aijkok->csrmatH;
225   PetscFunctionReturn(0);
226 }
227 
228 /* y = A x */
229 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
230 {
231   PetscErrorCode                   ierr;
232   Mat_SeqAIJKokkos                 *aijkok;
233   ConstPetscScalarKokkosView       xv;
234   PetscScalarKokkosView            yv;
235 
236   PetscFunctionBegin;
237   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
238   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
239   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
240   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
241   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
242   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
243   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
244   ierr   = WaitForKokkos();CHKERRQ(ierr);
245   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
246   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
247   PetscFunctionReturn(0);
248 }
249 
250 /* y = A^T x */
251 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
252 {
253   PetscErrorCode                   ierr;
254   Mat_SeqAIJKokkos                 *aijkok;
255   const char                       *mode;
256   ConstPetscScalarKokkosView       xv;
257   PetscScalarKokkosView            yv;
258   KokkosCsrMatrix                  *csrmat;
259 
260   PetscFunctionBegin;
261   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
262   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
263   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
264   if (A->form_explicit_transpose) {
265     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
266     mode = "N";
267   } else {
268     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
269     csrmat = &aijkok->csrmat;
270     mode = "T";
271   }
272   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
273   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
274   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
275   ierr = WaitForKokkos();CHKERRQ(ierr);
276   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
277   PetscFunctionReturn(0);
278 }
279 
280 /* y = A^H x */
281 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
282 {
283   PetscErrorCode                   ierr;
284   Mat_SeqAIJKokkos                 *aijkok;
285   const char                       *mode;
286   ConstPetscScalarKokkosView       xv;
287   PetscScalarKokkosView            yv;
288   KokkosCsrMatrix                  *csrmat;
289 
290   PetscFunctionBegin;
291   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
292   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
293   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
294   if (A->form_explicit_transpose) {
295     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
296     mode = "N";
297   } else {
298     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
299     csrmat = &aijkok->csrmat;
300     mode = "C";
301   }
302   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
303   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
304   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
305   ierr = WaitForKokkos();CHKERRQ(ierr);
306   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
307   PetscFunctionReturn(0);
308 }
309 
310 /* z = A x + y */
311 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
312 {
313   PetscErrorCode                   ierr;
314   Mat_SeqAIJKokkos                 *aijkok;
315   ConstPetscScalarKokkosView       xv,yv;
316   PetscScalarKokkosView            zv;
317 
318   PetscFunctionBegin;
319   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
320   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
321   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
322   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
323   if (zz != yy) Kokkos::deep_copy(zv,yv);
324   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
325   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
326   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
327   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
328   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
329   ierr = WaitForKokkos();CHKERRQ(ierr);
330   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
331   PetscFunctionReturn(0);
332 }
333 
334 /* z = A^T x + y */
335 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
336 {
337   PetscErrorCode                   ierr;
338   Mat_SeqAIJKokkos                 *aijkok;
339   const char                       *mode;
340   ConstPetscScalarKokkosView       xv,yv;
341   PetscScalarKokkosView            zv;
342   KokkosCsrMatrix                  *csrmat;
343 
344   PetscFunctionBegin;
345   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
346   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
347   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
348   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
349   if (zz != yy) Kokkos::deep_copy(zv,yv);
350   if (A->form_explicit_transpose) {
351     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
352     mode = "N";
353   } else {
354     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
355     csrmat = &aijkok->csrmat;
356     mode = "T";
357   }
358   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
359   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
360   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
361   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
362   ierr = WaitForKokkos();CHKERRQ(ierr);
363   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
364   PetscFunctionReturn(0);
365 }
366 
367 /* z = A^H x + y */
368 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
369 {
370   PetscErrorCode                   ierr;
371   Mat_SeqAIJKokkos                 *aijkok;
372   const char                       *mode;
373   ConstPetscScalarKokkosView       xv,yv;
374   PetscScalarKokkosView            zv;
375   KokkosCsrMatrix                  *csrmat;
376 
377   PetscFunctionBegin;
378   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
379   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
380   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
381   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
382   if (zz != yy) Kokkos::deep_copy(zv,yv);
383   if (A->form_explicit_transpose) {
384     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
385     mode = "N";
386   } else {
387     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
388     csrmat = &aijkok->csrmat;
389     mode = "C";
390   }
391   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
392   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
393   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
394   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
395   ierr = WaitForKokkos();CHKERRQ(ierr);
396   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
397   PetscFunctionReturn(0);
398 }
399 
400 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
401 {
402   PetscErrorCode            ierr;
403   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
404 
405   PetscFunctionBegin;
406   switch (op) {
407     case MAT_FORM_EXPLICIT_TRANSPOSE:
408       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
409       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
410       A->form_explicit_transpose = flg;
411       break;
412     default:
413       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
414       break;
415   }
416   PetscFunctionReturn(0);
417 }
418 
419 /* Depending on reuse, either build a new mat, or use the existing mat */
420 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
421 {
422   PetscErrorCode   ierr;
423   Mat_SeqAIJ       *aseq;
424 
425   PetscFunctionBegin;
426   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
427   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
428     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
429   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
430     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
431   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
432     if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
433     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
434     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
435     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
436     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
437     aseq = static_cast<Mat_SeqAIJ*>(A->data);
438     if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */
439       if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
440       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
441     }
442   }
443   PetscFunctionReturn(0);
444 }
445 
446 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
447    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
448  */
449 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
450 {
451   PetscErrorCode        ierr;
452   Mat_SeqAIJ            *bseq;
453   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
454   Mat                   mat;
455 
456   PetscFunctionBegin;
457   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
458   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
459   mat  = *B;
460   if (A->assembled) {
461     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
462     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
463     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
464     /* Now copy values to B if needed */
465     if (dupOption == MAT_COPY_VALUES) {
466       if (akok->a_dual.need_sync_device()) {
467         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
468         bkok->a_dual.modify_host();
469       } else { /* If device has the latest data, we only copy data on device */
470         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
471         bkok->a_dual.modify_device();
472       }
473     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
474       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
475       bkok->a_dual.modify_host();
476     }
477     mat->spptr = bkok;
478   }
479 
480   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
481   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
482   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
483   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
484   PetscFunctionReturn(0);
485 }
486 
487 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
488 {
489   PetscErrorCode             ierr;
490   Mat_SeqAIJKokkos           *aijkok;
491 
492   PetscFunctionBegin;
493   if (A->factortype == MAT_FACTOR_NONE) {
494     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
495     if (aijkok) {
496       if (aijkok->device_mat_d.data()) {
497         delete aijkok->colmap_d;
498         delete aijkok->i_uncompressed_d;
499       }
500       if (aijkok->diag_d) delete aijkok->diag_d;
501     }
502     delete aijkok;
503   } else {
504     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
505   }
506   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
507   A->spptr = NULL;
508   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
509   PetscFunctionReturn(0);
510 }
511 
512 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
513 {
514   PetscErrorCode ierr;
515 
516   PetscFunctionBegin;
517   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
518   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
519   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
520   PetscFunctionReturn(0);
521 }
522 
523 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
524 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
525 {
526   PetscErrorCode               ierr;
527   Mat_SeqAIJ                   *a,*b;
528   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
529   MatScalarKokkosView          aa,ba,ca;
530   MatRowMapKokkosView          ai,bi,ci;
531   MatColIdxKokkosView          aj,bj,cj;
532   PetscInt                     m,n,nnz,aN;
533 
534   PetscFunctionBegin;
535   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
536   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
537   PetscValidPointer(C,4);
538   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
539   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
540   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
541   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
542 
543   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
544   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
545   a    = static_cast<Mat_SeqAIJ*>(A->data);
546   b    = static_cast<Mat_SeqAIJ*>(B->data);
547   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
548   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
549   aa   = akok->a_dual.view_device();
550   ai   = akok->i_dual.view_device();
551   ba   = bkok->a_dual.view_device();
552   bi   = bkok->i_dual.view_device();
553   m    = A->rmap->n; /* M, N and nnz of C */
554   n    = A->cmap->n + B->cmap->n;
555   nnz  = a->nz + b->nz;
556   aN   = A->cmap->n; /* N of A */
557   if (reuse == MAT_INITIAL_MATRIX) {
558     aj = akok->j_dual.view_device();
559     bj = bkok->j_dual.view_device();
560     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
561     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
562     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
563     ca = ca_dual.view_device();
564     ci = ci_dual.view_device();
565     cj = cj_dual.view_device();
566 
567     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
568     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
569       PetscInt i = t.league_rank(); /* row i */
570       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
571 
572       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
573         ci(i) = coffset;
574         if (i == m-1) ci(m) = ai(m) + bi(m);
575       });
576 
577       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
578         if (k < alen) {
579           ca(coffset+k) = aa(ai(i)+k);
580           cj(coffset+k) = aj(ai(i)+k);
581         } else {
582           ca(coffset+k) = ba(bi(i)+k-alen);
583           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
584         }
585       });
586     });
587     ca_dual.modify_device();
588     ci_dual.modify_device();
589     cj_dual.modify_device();
590     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
591     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
592   } else if (reuse == MAT_REUSE_MATRIX) {
593     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
594     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
595     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
596     ca   = ckok->a_dual.view_device();
597     ci   = ckok->i_dual.view_device();
598 
599     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
600       PetscInt i = t.league_rank(); /* row i */
601       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
602       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
603         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
604         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
605       });
606     });
607     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
608   }
609   PetscFunctionReturn(0);
610 }
611 
612 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
613 {
614   PetscFunctionBegin;
615   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
616   PetscFunctionReturn(0);
617 }
618 
619 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
620 {
621   PetscErrorCode                 ierr;
622   Mat_Product                    *product = C->product;
623   Mat                            A,B;
624   bool                           transA,transB; /* use bool, since KK needs this type */
625   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
626   Mat_SeqAIJ                     *c;
627   MatProductData_SeqAIJKokkos    *pdata;
628   KokkosCsrMatrix                *csrmatA,*csrmatB;
629 
630   PetscFunctionBegin;
631   MatCheckProduct(C,1);
632   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
633   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
634 
635   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
636     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
637     PetscFunctionReturn(0);
638   }
639 
640   switch (product->type) {
641     case MATPRODUCT_AB:  transA = false; transB = false; break;
642     case MATPRODUCT_AtB: transA = true;  transB = false; break;
643     case MATPRODUCT_ABt: transA = false; transB = true;  break;
644     default:
645       SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
646   }
647 
648   A     = product->A;
649   B     = product->B;
650   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
651   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
652   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
653   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
654   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
655 
656   if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
657 
658   csrmatA = &akok->csrmat;
659   csrmatB = &bkok->csrmat;
660 
661   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
662   if (transA) {
663     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
664     transA = false;
665   }
666 
667   if (transB) {
668     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
669     transB = false;
670   }
671 
672   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
673   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
674   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
675   /* shorter version of MatAssemblyEnd_SeqAIJ */
676   c = (Mat_SeqAIJ*)C->data;
677   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
678   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
679   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
680   c->reallocs         = 0;
681   C->info.mallocs     = 0;
682   C->info.nz_unneeded = 0;
683   C->assembled        = C->was_assembled = PETSC_TRUE;
684   C->num_ass++;
685   PetscFunctionReturn(0);
686 }
687 
688 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
689 {
690   PetscErrorCode                 ierr;
691   Mat_Product                    *product = C->product;
692   MatProductType                 ptype;
693   Mat                            A,B;
694   bool                           transA,transB;
695   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
696   MatProductData_SeqAIJKokkos    *pdata;
697   MPI_Comm                       comm;
698   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
699 
700   PetscFunctionBegin;
701   MatCheckProduct(C,1);
702   ierr = PetscObjectGetComm((PetscObject)C,&comm);
703   if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty");
704   A       = product->A;
705   B       = product->B;
706   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
707   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
708   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
709   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
710   csrmatA = &akok->csrmat;
711   csrmatB = &bkok->csrmat;
712 
713   ptype   = product->type;
714   switch (ptype) {
715     case MATPRODUCT_AB:  transA = false; transB = false; break;
716     case MATPRODUCT_AtB: transA = true;  transB = false; break;
717     case MATPRODUCT_ABt: transA = false; transB = true;  break;
718     default:
719       SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
720   }
721 
722   product->data = pdata = new MatProductData_SeqAIJKokkos();
723   pdata->kh.set_team_work_size(16);
724   pdata->kh.set_dynamic_scheduling(true);
725   pdata->reusesym = product->api_user;
726 
727   /* TODO: add command line options to select spgemm algorithms */
728   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
729 #if defined(PETSC_HAVE_CUDA)
730   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
731     /* This algorithm + cuda-10.2 sometimes gave wrong results (invalid device pointers in csrmatC) and failed snes/tutorials/ex56.c */
732     spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_CUSPARSE;
733   #endif
734 #endif
735   pdata->kh.create_spgemm_handle(spgemm_alg);
736 
737   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
738   if (transA) {
739     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
740     transA = false;
741   }
742 
743   if (transB) {
744     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
745     transB = false;
746   }
747 
748   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
749   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
750     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
751     calling new Mat_SeqAIJKokkos().
752     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
753   */
754   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
755   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
756 
757   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
758   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
759   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
760   PetscFunctionReturn(0);
761 }
762 
763 /* handles sparse matrix matrix ops */
764 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
765 {
766   PetscErrorCode ierr;
767   Mat_Product    *product = mat->product;
768   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
769 
770   PetscFunctionBegin;
771   MatCheckProduct(mat,1);
772   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
773   if (product->type == MATPRODUCT_ABC) {
774     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
775   }
776   if (Biskok && Ciskok) {
777     switch (product->type) {
778     case MATPRODUCT_AB:
779     case MATPRODUCT_AtB:
780     case MATPRODUCT_ABt:
781       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
782       break;
783     case MATPRODUCT_PtAP:
784     case MATPRODUCT_RARt:
785     case MATPRODUCT_ABC:
786       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
787       break;
788     default:
789       break;
790     }
791   } else { /* fallback for AIJ */
792     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
793   }
794   PetscFunctionReturn(0);
795 }
796 
797 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
798 {
799   PetscErrorCode   ierr;
800   Mat_SeqAIJKokkos *aijkok;
801 
802   PetscFunctionBegin;
803   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
804   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
805   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
806   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
807   ierr = WaitForKokkos();CHKERRQ(ierr);
808   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
809   PetscFunctionReturn(0);
810 }
811 
812 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
813 {
814   PetscErrorCode   ierr;
815   Mat_SeqAIJKokkos *aijkok;
816 
817   PetscFunctionBegin;
818   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
819   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
820   KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
821   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); /* Cached diagonal values are invalided */
822   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
823   PetscFunctionReturn(0);
824 }
825 
826 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
827 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
828 {
829   PetscErrorCode     ierr;
830   Mat_SeqAIJKokkos   *aijkok;
831 
832   PetscFunctionBegin;
833   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
834   PetscValidPointer(kv,2);
835   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
836   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
837   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
838   *kv    = aijkok->a_dual.view_device();
839   PetscFunctionReturn(0);
840 }
841 
842 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
843 {
844   PetscFunctionBegin;
845   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
846   PetscValidPointer(kv,2);
847   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
848   PetscFunctionReturn(0);
849 }
850 
851 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv)
852 {
853   PetscErrorCode     ierr;
854   Mat_SeqAIJKokkos   *aijkok;
855 
856   PetscFunctionBegin;
857   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
858   PetscValidPointer(kv,2);
859   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
860   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
861   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
862   *kv    = aijkok->a_dual.view_device();
863   PetscFunctionReturn(0);
864 }
865 
866 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv)
867 {
868   PetscErrorCode     ierr;
869 
870   PetscFunctionBegin;
871   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
872   PetscValidPointer(kv,2);
873   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
874   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
875   PetscFunctionReturn(0);
876 }
877 
878 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
879 {
880   Mat_SeqAIJKokkos   *aijkok;
881 
882   PetscFunctionBegin;
883   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
884   PetscValidPointer(kv,2);
885   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
886   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
887   *kv    = aijkok->a_dual.view_device();
888   PetscFunctionReturn(0);
889 }
890 
891 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
892 {
893   PetscErrorCode     ierr;
894 
895   PetscFunctionBegin;
896   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
897   PetscValidPointer(kv,2);
898   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
899   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
900   PetscFunctionReturn(0);
901 }
902 
903 /* Computes Y += alpha X */
904 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
905 {
906   PetscErrorCode             ierr;
907   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
908   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
909   ConstMatScalarKokkosView   Xa;
910   MatScalarKokkosView        Ya;
911 
912   PetscFunctionBegin;
913   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
914   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
915   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
916   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
917 
918   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
919     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
920     PetscBool e;
921     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
922     if (e) {
923       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
924       if (e) pattern = SAME_NONZERO_PATTERN;
925     }
926   }
927 
928   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
929     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
930     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
931     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
932   */
933   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
934   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
935   Xa   = xkok->a_dual.view_device();
936   Ya   = ykok->a_dual.view_device();
937 
938   if (pattern == SAME_NONZERO_PATTERN) {
939     KokkosBlas::axpy(alpha,Xa,Ya);
940     ierr = MatSeqAIJKokkosModifyDevice(Y);
941   } else if (pattern == SUBSET_NONZERO_PATTERN) {
942     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
943     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
944 
945     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
946       PetscInt i = t.league_rank(); /* row i */
947       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
948         PetscInt p,q = Yi(i);
949         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
950           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
951           if (Xj(p) == Yj(q)) { /* Found it */
952             Ya(q) += alpha * Xa(p);
953             q++;
954           } else {
955             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
956                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
957             */
958             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
959           }
960         }
961       });
962     });
963     ierr = MatSeqAIJKokkosModifyDevice(Y);
964   } else { /* different nonzero patterns */
965     Mat             Z;
966     KokkosCsrMatrix zcsr;
967     KernelHandle    kh;
968     kh.create_spadd_handle(false);
969     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
970     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
971     zkok = new Mat_SeqAIJKokkos(zcsr);
972     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
973     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
974     kh.destroy_spadd_handle();
975   }
976 
977   PetscFunctionReturn(0);
978 }
979 
980 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
981 {
982   PetscErrorCode ierr;
983 
984   PetscFunctionBegin;
985   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
986   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
987   B->offloadmask = PETSC_OFFLOAD_CPU;
988   PetscFunctionReturn(0);
989 }
990 
991 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
992 {
993   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
994 
995   PetscFunctionBegin;
996   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
997   A->boundtocpu  = PETSC_FALSE;
998 
999   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1000   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1001   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1002   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1003   A->ops->scale                     = MatScale_SeqAIJKokkos;
1004   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1005   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1006   A->ops->mult                      = MatMult_SeqAIJKokkos;
1007   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1008   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1009   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1010   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1011   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1012   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1013   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1014   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1015   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1016   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1017   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1018   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1019   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1020   PetscFunctionReturn(0);
1021 }
1022 
1023 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1024 {
1025   PetscErrorCode     ierr;
1026   Mat_SeqAIJ         *aseq;
1027   PetscInt           i,m,n;
1028 
1029   PetscFunctionBegin;
1030   if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1031 
1032   m    = akok->nrows();
1033   n    = akok->ncols();
1034   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1035   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1036 
1037   /* Set up data structures of A as a MATSEQAIJ */
1038   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1039   aseq = (Mat_SeqAIJ*)(A)->data;
1040 
1041   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1042   akok->j_dual.sync_host();
1043 
1044   aseq->i            = akok->i_host_data();
1045   aseq->j            = akok->j_host_data();
1046   aseq->a            = akok->a_host_data();
1047   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1048   aseq->singlemalloc = PETSC_FALSE;
1049   aseq->free_a       = PETSC_FALSE;
1050   aseq->free_ij      = PETSC_FALSE;
1051   aseq->nz           = akok->nnz();
1052   aseq->maxnz        = aseq->nz;
1053 
1054   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1055   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1056   for (i=0; i<m; i++) {
1057     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1058   }
1059 
1060   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1061   akok->nonzerostate = A->nonzerostate;
1062   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1063   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1064   A->spptr = akok;
1065   PetscFunctionReturn(0);
1066 }
1067 
1068 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1069 
1070    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1071  */
1072 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1073 {
1074   PetscErrorCode     ierr;
1075 
1076   PetscFunctionBegin;
1077   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1078   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
1079   PetscFunctionReturn(0);
1080 }
1081 
1082 /* --------------------------------------------------------------------------------*/
1083 /*@C
1084    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1085    (the default parallel PETSc format). This matrix will ultimately be handled by
1086    Kokkos for calculations. For good matrix
1087    assembly performance the user should preallocate the matrix storage by setting
1088    the parameter nz (or the array nnz).  By setting these parameters accurately,
1089    performance during matrix assembly can be increased by more than a factor of 50.
1090 
1091    Collective
1092 
1093    Input Parameters:
1094 +  comm - MPI communicator, set to PETSC_COMM_SELF
1095 .  m - number of rows
1096 .  n - number of columns
1097 .  nz - number of nonzeros per row (same for all rows)
1098 -  nnz - array containing the number of nonzeros in the various rows
1099          (possibly different for each row) or NULL
1100 
1101    Output Parameter:
1102 .  A - the matrix
1103 
1104    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1105    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1106    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1107 
1108    Notes:
1109    If nnz is given then nz is ignored
1110 
1111    The AIJ format (also called the Yale sparse matrix format or
1112    compressed row storage), is fully compatible with standard Fortran 77
1113    storage.  That is, the stored row and column indices can begin at
1114    either one (as in Fortran) or zero.  See the users' manual for details.
1115 
1116    Specify the preallocated storage with either nz or nnz (not both).
1117    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1118    allocation.  For large problems you MUST preallocate memory or you
1119    will get TERRIBLE performance, see the users' manual chapter on matrices.
1120 
1121    By default, this format uses inodes (identical nodes) when possible, to
1122    improve numerical efficiency of matrix-vector products and solves. We
1123    search for consecutive rows with the same nonzero structure, thereby
1124    reusing matrix information to achieve increased efficiency.
1125 
1126    Level: intermediate
1127 
1128 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1129 @*/
1130 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1131 {
1132   PetscErrorCode ierr;
1133 
1134   PetscFunctionBegin;
1135   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1136   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1137   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1138   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1139   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1140   PetscFunctionReturn(0);
1141 }
1142 
1143 typedef Kokkos::TeamPolicy<>::member_type team_member;
1144 //
1145 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
1146 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1147 //
1148 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1149 {
1150   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1151   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1152   IS                 isrow = b->row,isicol = b->icol;
1153   PetscErrorCode     ierr;
1154   const PetscInt     *r_h,*ic_h;
1155   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data();
1156   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1157   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1158   PetscBool          row_identity,col_identity;
1159   PetscInt           nc, Nf, nVec=32; // should be a parameter
1160   PetscContainer     container;
1161 
1162   PetscFunctionBegin;
1163   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n);
1164   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
1165   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1166   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
1167   if (container) {
1168     PetscInt *pNf=NULL, nv;
1169     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
1170     Nf = (*pNf)%1000;
1171     nv = (*pNf)/1000;
1172     if (nv>0) nVec = nv;
1173   } else Nf = 1;
1174   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
1175   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
1176   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
1177   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
1178 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1179   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1180 #endif
1181   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1182   {
1183 #define KOKKOS_SHARED_LEVEL 1
1184     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1185     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1186     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1187     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1188     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1189     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1190     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1191     size_t flops_h = 0.0;
1192     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1193     Kokkos::View<size_t> d_flops_k ("flops");
1194     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1195     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1196     Kokkos::deep_copy (d_flops_k, h_flops_k);
1197     Kokkos::deep_copy (d_r_k, h_r_k);
1198     Kokkos::deep_copy (d_ic_k, h_ic_k);
1199     // Fill A --> fact
1200     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1201         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1202         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1203         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1204         // zero rows of B
1205         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1206             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1207             PetscScalar *baL = ba_d + bi_d[rowb];
1208             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1209             /* zero (unfactored row) */
1210             for (int j=0;j<nzbL;j++) baL[j] = 0;
1211             for (int j=0;j<nzbU;j++) baU[j] = 0;
1212           });
1213         // copy A into B
1214         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1215             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1216             const PetscScalar *av    = aa_d + ai_d[rowa];
1217             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1218             /* load in initial (unfactored row) */
1219             for (int j=0;j<nza;j++) {
1220               PetscInt    colb = ic[ajtmp[j]];
1221               PetscScalar vala = av[j];
1222               if (colb == rowb) {
1223                 *(ba_d + bdiag_d[rowb]) = vala;
1224               } else {
1225                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1226                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1227                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1228                 for (int j=0; j<nz ; j++) {
1229                   if (pbj[j] == colb) {
1230                     pba[j] = vala;
1231                     set++;
1232                     break;
1233                   }
1234                 }
1235                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1236               }
1237             }
1238           });
1239       });
1240     Kokkos::fence();
1241 
1242     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1243         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1244         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1245         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1246         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1247         const PetscInt  start = field*nloc, end = start + nloc;
1248         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1249         // A22 panel update for each row A(1,:) and col A(:,1)
1250         for (int ii=start; ii<end-1; ii++) {
1251           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1252           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1253           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1254           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1255           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1256               PetscInt kIdx = j*Ni + field_block_idx;
1257               if (kIdx >= nzUi) /* void */ ;
1258               else {
1259                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1260                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1261                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1262                 size_t         st_idx;
1263                 // find and do L(k,i) = A(:k,i) / A(i,i)
1264                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1265                 // get column, there has got to be a better way
1266                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1267                     //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii);
1268                     if (pjL[j] == ii) {
1269                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1270                       idx = j; // output
1271                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1272                     }
1273                   }, st_idx);
1274                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1275                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii);
1276                 else { // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1277                   // U(i+1,:end)
1278                   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1279                       PetscScalar Uij = baUi[uiIdx];
1280                       PetscInt    col = bjUi[uiIdx];
1281                       if (col==myk) {
1282                         // A_kk = A_kk - L_ki * U_ij(k)
1283                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1284                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1285                       } else {
1286                         PetscScalar    *start, *end, *pAkjv=NULL;
1287                         PetscInt       high, low;
1288                         const PetscInt *startj;
1289                         if (col<myk) { // L
1290                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1291                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1292                           start = pLki+1; // start at pLki+1, A22(myk,1)
1293                           startj= bj_d + bi_d[myk] + idx;
1294                           end   = ba_d + bi_d[myk+1];
1295                         } else {
1296                           PetscInt idx = bdiag_d[myk+1]+1;
1297                           start = ba_d + idx;
1298                           startj= bj_d + idx;
1299                           end   = ba_d + bdiag_d[myk];
1300                         }
1301                         // search for 'col', use bisection search - TODO
1302                         low  = 0;
1303                         high = (PetscInt)(end-start);
1304                         while (high-low > 5) {
1305                           int t = (low+high)/2;
1306                           if (startj[t] > col) high = t;
1307                           else                 low  = t;
1308                         }
1309                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1310                           if (startj[pAkjv-start] == col) break;
1311                         }
1312                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col);
1313                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1314                       }
1315                     });
1316                 }
1317               }
1318             });
1319           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1320           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1321         } /* endof for (i=0; i<n; i++) { */
1322         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1323       });
1324     Kokkos::fence();
1325     Kokkos::deep_copy (h_flops_k, d_flops_k);
1326 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1327     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1328 #elif  defined(PETSC_USE_LOG)
1329     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1330 #endif
1331     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1332         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1333         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1334         /* Invert diagonal for simpler triangular solves */
1335         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1336             int i = start + outer_index*Ni + lg_rank%Ni;
1337             if (i < end) {
1338               PetscScalar *pv = ba_d + bdiag_d[i];
1339               *pv = 1.0/(*pv);
1340             }
1341           });
1342       });
1343   }
1344 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1345   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1346 #endif
1347   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
1348   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
1349 
1350   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1351   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
1352   if (b->inode.size) {
1353     B->ops->solve = MatSolve_SeqAIJ_Inode;
1354   } else if (row_identity && col_identity) {
1355     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1356   } else {
1357     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1358   }
1359   B->offloadmask = PETSC_OFFLOAD_GPU;
1360   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
1361   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1362   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1363   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1364   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1365   B->assembled              = PETSC_TRUE;
1366   B->preallocated           = PETSC_TRUE;
1367 
1368   PetscFunctionReturn(0);
1369 }
1370 
1371 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1372 {
1373   PetscErrorCode   ierr;
1374 
1375   PetscFunctionBegin;
1376   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1377   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1378   PetscFunctionReturn(0);
1379 }
1380 
1381 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1382 {
1383   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1384 
1385   PetscFunctionBegin;
1386   if (!factors->sptrsv_symbolic_completed) {
1387     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1388     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1389     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1390   }
1391   PetscFunctionReturn(0);
1392 }
1393 
1394 /* Check if we need to update factors etc for transpose solve */
1395 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1396 {
1397   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1398   MatColIdxType             n = A->rmap->n;
1399 
1400   PetscFunctionBegin;
1401   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1402     /* Update L^T and do sptrsv symbolic */
1403     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1404     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1405     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1406     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1407 
1408     KokkosKernels::Impl::transpose_matrix<
1409       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1410       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1411       MatRowMapKokkosView,DefaultExecutionSpace>(
1412         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1413         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1414 
1415     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1416       We have to sort the indices, until KK provides finer control options.
1417     */
1418     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1419       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1420         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1421 
1422     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1423 
1424     /* Update U^T and do sptrsv symbolic */
1425     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1426     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1427     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1428     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1429 
1430     KokkosKernels::Impl::transpose_matrix<
1431       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1432       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1433       MatRowMapKokkosView,DefaultExecutionSpace>(
1434         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1435         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1436 
1437     /* Sort indices. See comments above */
1438     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1439       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1440         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1441 
1442     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1443     factors->transpose_updated = PETSC_TRUE;
1444   }
1445   PetscFunctionReturn(0);
1446 }
1447 
1448 /* Solve Ax = b, with A = LU */
1449 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1450 {
1451   PetscErrorCode                 ierr;
1452   ConstPetscScalarKokkosView     bv;
1453   PetscScalarKokkosView          xv;
1454   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1455 
1456   PetscFunctionBegin;
1457   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
1458   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1459   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1460   /* Solve L tmpv = b */
1461   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1462   /* Solve Ux = tmpv */
1463   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1464   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1465   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1466   PetscFunctionReturn(0);
1467 }
1468 
1469 /* Solve A^T x = b, where A^T = U^T L^T */
1470 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1471 {
1472   PetscErrorCode                 ierr;
1473   ConstPetscScalarKokkosView     bv;
1474   PetscScalarKokkosView          xv;
1475   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1476 
1477   PetscFunctionBegin;
1478   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
1479   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1480   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1481   /* Solve U^T tmpv = b */
1482   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1483 
1484   /* Solve L^T x = tmpv */
1485   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1486   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1487   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1488   PetscFunctionReturn(0);
1489 }
1490 
1491 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1492 {
1493   PetscErrorCode                 ierr;
1494   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1495   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1496   PetscInt                       fill_lev = info->levels;
1497 
1498   PetscFunctionBegin;
1499   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1500 
1501   auto a_d = aijkok->a_dual.view_device();
1502   auto i_d = aijkok->i_dual.view_device();
1503   auto j_d = aijkok->j_dual.view_device();
1504 
1505   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1506 
1507   B->assembled                       = PETSC_TRUE;
1508   B->preallocated                    = PETSC_TRUE;
1509   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1510   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1511   B->ops->matsolve                   = NULL;
1512   B->ops->matsolvetranspose          = NULL;
1513   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1514 
1515   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1516   factors->transpose_updated         = PETSC_FALSE;
1517   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1518   PetscFunctionReturn(0);
1519 }
1520 
1521 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1522 {
1523   PetscErrorCode                 ierr;
1524   Mat_SeqAIJKokkos               *aijkok;
1525   Mat_SeqAIJ                     *b;
1526   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1527   PetscInt                       fill_lev = info->levels;
1528   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1529   PetscInt                       n = A->rmap->n;
1530 
1531   PetscFunctionBegin;
1532   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1533   /* Rebuild factors */
1534   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1535   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1536 
1537   /* Create a spiluk handle and then do symbolic factorization */
1538   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1539   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1540 
1541   auto spiluk_handle = factors->kh.get_spiluk_handle();
1542 
1543   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1544   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1545   Kokkos::realloc(factors->iU_d,n+1);
1546   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1547 
1548   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1549   auto i_d = aijkok->i_dual.view_device();
1550   auto j_d = aijkok->j_dual.view_device();
1551   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1552   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1553 
1554   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1555   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1556   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1557   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1558 
1559   /* TODO: add options to select sptrsv algorithms */
1560   /* Create sptrsv handles for L, U and their transpose */
1561  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1562   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1563  #else
1564   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1565  #endif
1566 
1567   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1568   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1569   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1570   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1571 
1572   /* Fill fields of the factor matrix B */
1573   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1574   b     = (Mat_SeqAIJ*)B->data;
1575   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1576   B->info.fill_ratio_given  = info->fill;
1577   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1578 
1579   B->offloadmask            = PETSC_OFFLOAD_GPU;
1580   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1581   PetscFunctionReturn(0);
1582 }
1583 
1584 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1585 {
1586   PetscErrorCode   ierr;
1587   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1588   const PetscInt   nrows   = A->rmap->n;
1589 
1590   PetscFunctionBegin;
1591   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1592   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1593   // move B data into Kokkos
1594   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
1595   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
1596   {
1597     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1598     if (!baijkok->diag_d) {
1599       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1600       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1601       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
1602     }
1603   }
1604   PetscFunctionReturn(0);
1605 }
1606 
1607 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1608 {
1609   PetscFunctionBegin;
1610   *type = MATSOLVERKOKKOS;
1611   PetscFunctionReturn(0);
1612 }
1613 
1614 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1615 {
1616   PetscFunctionBegin;
1617   *type = MATSOLVERKOKKOSDEVICE;
1618   PetscFunctionReturn(0);
1619 }
1620 
1621 /*MC
1622   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1623   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1624 
1625   Level: beginner
1626 
1627 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1628 M*/
1629 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1630 {
1631   PetscErrorCode ierr;
1632   PetscInt       n = A->rmap->n;
1633 
1634   PetscFunctionBegin;
1635   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1636   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1637   (*B)->factortype = ftype;
1638   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1639   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1640 
1641   if (ftype == MAT_FACTOR_LU) {
1642     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1643     (*B)->canuseordering         = PETSC_TRUE;
1644     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1645   } else if (ftype == MAT_FACTOR_ILU) {
1646     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1647     (*B)->canuseordering         = PETSC_FALSE;
1648     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1649   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1650 
1651   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1652   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1653   PetscFunctionReturn(0);
1654 }
1655 
1656 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1657 {
1658   PetscErrorCode ierr;
1659   PetscInt       n = A->rmap->n;
1660 
1661   PetscFunctionBegin;
1662   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1663   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1664   (*B)->factortype = ftype;
1665   (*B)->canuseordering = PETSC_TRUE;
1666   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1667   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1668 
1669   if (ftype == MAT_FACTOR_LU) {
1670     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1671     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1672   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1673 
1674   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1675   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1676   PetscFunctionReturn(0);
1677 }
1678 
1679 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1680 {
1681   PetscErrorCode ierr;
1682 
1683   PetscFunctionBegin;
1684   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1685   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1686   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1687   PetscFunctionReturn(0);
1688 }
1689 
1690 /* Utility to print out a KokkosCsrMatrix for debugging */
1691 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1692 {
1693   PetscErrorCode    ierr;
1694   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1695   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1696   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1697   const PetscInt    *i = iv.data();
1698   const PetscInt    *j = jv.data();
1699   const PetscScalar *a = av.data();
1700   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1701 
1702   PetscFunctionBegin;
1703   ierr = PetscPrintf(PETSC_COMM_SELF,"%D x %D SeqAIJKokkos, with %D nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1704   for (PetscInt k=0; k<m; k++) {
1705     ierr = PetscPrintf(PETSC_COMM_SELF,"%D: ",k);CHKERRQ(ierr);
1706     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1707       ierr = PetscPrintf(PETSC_COMM_SELF,"%D(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1708     }
1709     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1710   }
1711   PetscFunctionReturn(0);
1712 }
1713