xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 7487cd7ca1dddf3cbc146be559ee2e39856c5efc)
1 #include <petscvec_kokkos.hpp>
2 #include <petscpkg_version.h>
3 #include <petsc/private/petscimpl.h>
4 #include <petsc/private/sfimpl.h>
5 #include <petscsystypes.h>
6 #include <petscerror.h>
7 
8 #include <Kokkos_Core.hpp>
9 #include <KokkosBlas.hpp>
10 #include <KokkosKernels_Sorting.hpp>
11 #include <KokkosSparse_CrsMatrix.hpp>
12 #include <KokkosSparse_spmv.hpp>
13 #include <KokkosSparse_spiluk.hpp>
14 #include <KokkosSparse_sptrsv.hpp>
15 #include <KokkosSparse_spgemm.hpp>
16 #include <KokkosSparse_spadd.hpp>
17 
18 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
19 
20 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
21 
22 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24    In the latter case, it is important to set a_dual's sync state correctly.
25  */
26 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
27 {
28   PetscErrorCode    ierr;
29   Mat_SeqAIJ        *aijseq;
30   Mat_SeqAIJKokkos  *aijkok;
31 
32   PetscFunctionBegin;
33   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
34   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
35 
36   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38 
39   /* If aijkok does not exist, we just copy i, j to device.
40      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
41      In both cases, we build a new aijkok structure.
42   */
43   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
44     delete aijkok;
45     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
46     A->spptr = aijkok;
47   }
48 
49   if (aijkok && aijkok->device_mat_d.data()) {
50     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
51   }
52   PetscFunctionReturn(0);
53 }
54 
55 /* Sync CSR data to device if not yet */
56 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
57 {
58   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
59 
60   PetscFunctionBegin;
61   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
62   PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
63   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
64   if (aijkok->a_dual.need_sync_device()) {
65     aijkok->a_dual.sync_device();
66     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
67     aijkok->hermitian_updated = PETSC_FALSE;
68   }
69   PetscFunctionReturn(0);
70 }
71 
72 /* Mark the CSR data on device as modified */
73 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
74 {
75   PetscErrorCode   ierr;
76   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
77 
78   PetscFunctionBegin;
79   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
80   aijkok->a_dual.clear_sync_state();
81   aijkok->a_dual.modify_device();
82   aijkok->transpose_updated = PETSC_FALSE;
83   aijkok->hermitian_updated = PETSC_FALSE;
84   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
85   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
86   PetscFunctionReturn(0);
87 }
88 
89 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
90 {
91   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
92 
93   PetscFunctionBegin;
94   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
95    /* We do not expect one needs factors on host  */
96   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
97   PetscCheckFalse(!aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
98   aijkok->a_dual.sync_host();
99   PetscFunctionReturn(0);
100 }
101 
102 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
103 {
104   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
105 
106   PetscFunctionBegin;
107   if (aijkok) {
108     aijkok->a_dual.sync_host();
109     *array = aijkok->a_dual.view_host().data();
110   } else { /* Happens when calling MatSetValues on a newly created matrix */
111     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
112   }
113   PetscFunctionReturn(0);
114 }
115 
116 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
117 {
118   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
119 
120   PetscFunctionBegin;
121   if (aijkok) aijkok->a_dual.modify_host();
122   PetscFunctionReturn(0);
123 }
124 
125 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
126 {
127   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
128 
129   PetscFunctionBegin;
130   if (aijkok) {
131     aijkok->a_dual.sync_host();
132     *array = aijkok->a_dual.view_host().data();
133   } else {
134     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
135   }
136   PetscFunctionReturn(0);
137 }
138 
139 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
140 {
141   PetscFunctionBegin;
142   *array = NULL;
143   PetscFunctionReturn(0);
144 }
145 
146 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
147 {
148   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
149 
150   PetscFunctionBegin;
151   if (aijkok) {
152     *array = aijkok->a_dual.view_host().data();
153   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
154     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
155   }
156   PetscFunctionReturn(0);
157 }
158 
159 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
160 {
161   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
162 
163   PetscFunctionBegin;
164   if (aijkok) {
165     aijkok->a_dual.clear_sync_state();
166     aijkok->a_dual.modify_host();
167   }
168   PetscFunctionReturn(0);
169 }
170 
171 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
172 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
173 {
174   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
175   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
176 
177   PetscFunctionBegin;
178   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
179   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
180   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
181   PetscFunctionReturn(0);
182 }
183 
184 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
185 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
186 {
187   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
188 
189   PetscFunctionBegin;
190   if (aijkok && aijkok->device_mat_d.data()) {
191     *d_mat = aijkok->device_mat_d.data();
192   } else {
193     PetscErrorCode   ierr;
194     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
195     *d_mat  = NULL;
196   }
197   PetscFunctionReturn(0);
198 }
199 
200 /* Generate the transpose on device and cache it internally */
201 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
202 {
203   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
204 
205   PetscFunctionBegin;
206   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
208     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
209     CHKERRCXX(aijkok->a_dual.sync_device());
210     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
211     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
212     aijkok->transpose_updated = PETSC_TRUE;
213   }
214   *csrmatT = &aijkok->csrmatT;
215   PetscFunctionReturn(0);
216 }
217 
218 /* Generate the Hermitian on device and cache it internally */
219 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
220 {
221   PetscErrorCode                   ierr;
222   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
223 
224   PetscFunctionBegin;
225   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
226   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
227   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
228     CHKERRCXX(aijkok->a_dual.sync_device());
229     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
230     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
231    #if defined(PETSC_USE_COMPLEX)
232     const auto& a = aijkok->csrmatH.values;
233     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
234    #endif
235     aijkok->hermitian_updated = PETSC_TRUE;
236   }
237   *csrmatH = &aijkok->csrmatH;
238   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
239   PetscFunctionReturn(0);
240 }
241 
242 /* y = A x */
243 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
244 {
245   PetscErrorCode                   ierr;
246   Mat_SeqAIJKokkos                 *aijkok;
247   ConstPetscScalarKokkosView       xv;
248   PetscScalarKokkosView            yv;
249 
250   PetscFunctionBegin;
251   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
252   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
253   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
254   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
255   ierr   = PetscLogGpuTimeBegin();CHKERRQ(ierr);
256   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
257   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
258   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
259   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
260   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
261   PetscFunctionReturn(0);
262 }
263 
264 /* y = A^T x */
265 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
266 {
267   PetscErrorCode                   ierr;
268   Mat_SeqAIJKokkos                 *aijkok;
269   const char                       *mode;
270   ConstPetscScalarKokkosView       xv;
271   PetscScalarKokkosView            yv;
272   KokkosCsrMatrix                  *csrmat;
273 
274   PetscFunctionBegin;
275   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
276   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
277   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
278   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
279   if (A->form_explicit_transpose) {
280     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
281     mode = "N";
282   } else {
283     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
284     csrmat = &aijkok->csrmat;
285     mode = "T";
286   }
287   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
288   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
289   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
290   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
291   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
292   PetscFunctionReturn(0);
293 }
294 
295 /* y = A^H x */
296 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
297 {
298   PetscErrorCode                   ierr;
299   Mat_SeqAIJKokkos                 *aijkok;
300   const char                       *mode;
301   ConstPetscScalarKokkosView       xv;
302   PetscScalarKokkosView            yv;
303   KokkosCsrMatrix                  *csrmat;
304 
305   PetscFunctionBegin;
306   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
307   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
308   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
309   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
310   if (A->form_explicit_transpose) {
311     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
312     mode = "N";
313   } else {
314     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
315     csrmat = &aijkok->csrmat;
316     mode = "C";
317   }
318   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
319   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
320   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
321   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
322   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
323   PetscFunctionReturn(0);
324 }
325 
326 /* z = A x + y */
327 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
328 {
329   PetscErrorCode                   ierr;
330   Mat_SeqAIJKokkos                 *aijkok;
331   ConstPetscScalarKokkosView       xv,yv;
332   PetscScalarKokkosView            zv;
333 
334   PetscFunctionBegin;
335   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
336   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
337   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
338   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
339   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
340   if (zz != yy) Kokkos::deep_copy(zv,yv);
341   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
342   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
343   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
344   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
345   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
346   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
347   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
348   PetscFunctionReturn(0);
349 }
350 
351 /* z = A^T x + y */
352 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
353 {
354   PetscErrorCode                   ierr;
355   Mat_SeqAIJKokkos                 *aijkok;
356   const char                       *mode;
357   ConstPetscScalarKokkosView       xv,yv;
358   PetscScalarKokkosView            zv;
359   KokkosCsrMatrix                  *csrmat;
360 
361   PetscFunctionBegin;
362   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
363   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
364   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
365   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
366   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
367   if (zz != yy) Kokkos::deep_copy(zv,yv);
368   if (A->form_explicit_transpose) {
369     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
370     mode = "N";
371   } else {
372     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
373     csrmat = &aijkok->csrmat;
374     mode = "T";
375   }
376   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
377   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
378   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
379   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
380   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
381   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
382   PetscFunctionReturn(0);
383 }
384 
385 /* z = A^H x + y */
386 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
387 {
388   PetscErrorCode                   ierr;
389   Mat_SeqAIJKokkos                 *aijkok;
390   const char                       *mode;
391   ConstPetscScalarKokkosView       xv,yv;
392   PetscScalarKokkosView            zv;
393   KokkosCsrMatrix                  *csrmat;
394 
395   PetscFunctionBegin;
396   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
397   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
398   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
399   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
400   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
401   if (zz != yy) Kokkos::deep_copy(zv,yv);
402   if (A->form_explicit_transpose) {
403     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
404     mode = "N";
405   } else {
406     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
407     csrmat = &aijkok->csrmat;
408     mode = "C";
409   }
410   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
411   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
412   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
413   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
414   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
415   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
416   PetscFunctionReturn(0);
417 }
418 
419 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
420 {
421   PetscErrorCode            ierr;
422   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
423 
424   PetscFunctionBegin;
425   switch (op) {
426     case MAT_FORM_EXPLICIT_TRANSPOSE:
427       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
428       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
429       A->form_explicit_transpose = flg;
430       break;
431     default:
432       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
433       break;
434   }
435   PetscFunctionReturn(0);
436 }
437 
438 /* Depending on reuse, either build a new mat, or use the existing mat */
439 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
440 {
441   PetscErrorCode   ierr;
442   Mat_SeqAIJ       *aseq;
443 
444   PetscFunctionBegin;
445   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
446   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
447     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
448   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
449     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
450   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
451     PetscCheckFalse(A != *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
452     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
453     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
454     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
455     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
456     aseq = static_cast<Mat_SeqAIJ*>(A->data);
457     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
458       PetscCheckFalse(A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
459       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
460     }
461   }
462   PetscFunctionReturn(0);
463 }
464 
465 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
466    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
467  */
468 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
469 {
470   PetscErrorCode        ierr;
471   Mat_SeqAIJ            *bseq;
472   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
473   Mat                   mat;
474 
475   PetscFunctionBegin;
476   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
477   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
478   mat  = *B;
479   if (A->assembled) {
480     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
481     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
482     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
483     /* Now copy values to B if needed */
484     if (dupOption == MAT_COPY_VALUES) {
485       if (akok->a_dual.need_sync_device()) {
486         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
487         bkok->a_dual.modify_host();
488       } else { /* If device has the latest data, we only copy data on device */
489         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
490         bkok->a_dual.modify_device();
491       }
492     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
493       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
494       bkok->a_dual.modify_host();
495     }
496     mat->spptr = bkok;
497   }
498 
499   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
500   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
501   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
502   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
503   PetscFunctionReturn(0);
504 }
505 
506 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
507 {
508   PetscErrorCode    ierr;
509   Mat               At;
510   KokkosCsrMatrix   *internT;
511   Mat_SeqAIJKokkos  *atkok,*bkok;
512 
513   PetscFunctionBegin;
514   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
515   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
516     /* Deep copy internT, as we want to isolate the internal transpose */
517     CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
518     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
519     if (reuse == MAT_INITIAL_MATRIX) *B = At;
520     else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
521   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
522     if ((*B)->assembled) {
523       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
524       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
525       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
526     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
527       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
528       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
529       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
530       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
531       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
532     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
533   }
534   PetscFunctionReturn(0);
535 }
536 
537 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
538 {
539   PetscErrorCode             ierr;
540   Mat_SeqAIJKokkos           *aijkok;
541 
542   PetscFunctionBegin;
543   if (A->factortype == MAT_FACTOR_NONE) {
544     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
545     delete aijkok;
546   } else {
547     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
548   }
549   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
550   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
551   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       NULL);CHKERRQ(ierr);
552   A->spptr = NULL;
553   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
554   PetscFunctionReturn(0);
555 }
556 
557 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
558 {
559   PetscErrorCode ierr;
560 
561   PetscFunctionBegin;
562   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
563   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
564   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
565   PetscFunctionReturn(0);
566 }
567 
568 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
569 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
570 {
571   PetscErrorCode               ierr;
572   Mat_SeqAIJ                   *a,*b;
573   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
574   MatScalarKokkosView          aa,ba,ca;
575   MatRowMapKokkosView          ai,bi,ci;
576   MatColIdxKokkosView          aj,bj,cj;
577   PetscInt                     m,n,nnz,aN;
578 
579   PetscFunctionBegin;
580   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
581   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
582   PetscValidPointer(C,4);
583   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
584   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
585   PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
586   PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
587 
588   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
589   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
590   a    = static_cast<Mat_SeqAIJ*>(A->data);
591   b    = static_cast<Mat_SeqAIJ*>(B->data);
592   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
593   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
594   aa   = akok->a_dual.view_device();
595   ai   = akok->i_dual.view_device();
596   ba   = bkok->a_dual.view_device();
597   bi   = bkok->i_dual.view_device();
598   m    = A->rmap->n; /* M, N and nnz of C */
599   n    = A->cmap->n + B->cmap->n;
600   nnz  = a->nz + b->nz;
601   aN   = A->cmap->n; /* N of A */
602   if (reuse == MAT_INITIAL_MATRIX) {
603     aj = akok->j_dual.view_device();
604     bj = bkok->j_dual.view_device();
605     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
606     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
607     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
608     ca = ca_dual.view_device();
609     ci = ci_dual.view_device();
610     cj = cj_dual.view_device();
611 
612     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
613     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
614       PetscInt i = t.league_rank(); /* row i */
615       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
616 
617       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
618         ci(i) = coffset;
619         if (i == m-1) ci(m) = ai(m) + bi(m);
620       });
621 
622       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
623         if (k < alen) {
624           ca(coffset+k) = aa(ai(i)+k);
625           cj(coffset+k) = aj(ai(i)+k);
626         } else {
627           ca(coffset+k) = ba(bi(i)+k-alen);
628           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
629         }
630       });
631     });
632     ca_dual.modify_device();
633     ci_dual.modify_device();
634     cj_dual.modify_device();
635     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
636     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
637   } else if (reuse == MAT_REUSE_MATRIX) {
638     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
639     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
640     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
641     ca   = ckok->a_dual.view_device();
642     ci   = ckok->i_dual.view_device();
643 
644     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
645       PetscInt i = t.league_rank(); /* row i */
646       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
647       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
648         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
649         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
650       });
651     });
652     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
653   }
654   PetscFunctionReturn(0);
655 }
656 
657 static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
658 {
659   PetscFunctionBegin;
660   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
661   PetscFunctionReturn(0);
662 }
663 
664 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
665 {
666   PetscErrorCode                 ierr;
667   Mat_Product                    *product = C->product;
668   Mat                            A,B;
669   bool                           transA,transB; /* use bool, since KK needs this type */
670   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
671   Mat_SeqAIJ                     *c;
672   MatProductData_SeqAIJKokkos    *pdata;
673   KokkosCsrMatrix                *csrmatA,*csrmatB;
674 
675   PetscFunctionBegin;
676   MatCheckProduct(C,1);
677   PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
678   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
679 
680   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
681     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
682     PetscFunctionReturn(0);
683   }
684 
685   switch (product->type) {
686     case MATPRODUCT_AB:  transA = false; transB = false; break;
687     case MATPRODUCT_AtB: transA = true;  transB = false; break;
688     case MATPRODUCT_ABt: transA = false; transB = true;  break;
689     default:
690       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
691   }
692 
693   A     = product->A;
694   B     = product->B;
695   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
696   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
697   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
698   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
699   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
700 
701   PetscCheckFalse(!ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
702 
703   csrmatA = &akok->csrmat;
704   csrmatB = &bkok->csrmat;
705 
706   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
707   if (transA) {
708     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
709     transA = false;
710   }
711 
712   if (transB) {
713     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
714     transB = false;
715   }
716   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
717   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
718   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
719   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
720   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
721   /* shorter version of MatAssemblyEnd_SeqAIJ */
722   c = (Mat_SeqAIJ*)C->data;
723   ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
724   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
725   ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
726   c->reallocs         = 0;
727   C->info.mallocs     = 0;
728   C->info.nz_unneeded = 0;
729   C->assembled        = C->was_assembled = PETSC_TRUE;
730   C->num_ass++;
731   PetscFunctionReturn(0);
732 }
733 
734 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
735 {
736   PetscErrorCode                 ierr;
737   Mat_Product                    *product = C->product;
738   MatProductType                 ptype;
739   Mat                            A,B;
740   bool                           transA,transB;
741   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
742   MatProductData_SeqAIJKokkos    *pdata;
743   MPI_Comm                       comm;
744   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
745 
746   PetscFunctionBegin;
747   MatCheckProduct(C,1);
748   ierr = PetscObjectGetComm((PetscObject)C,&comm);
749   PetscCheckFalse(product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
750   A       = product->A;
751   B       = product->B;
752   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
753   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
754   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
755   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
756   csrmatA = &akok->csrmat;
757   csrmatB = &bkok->csrmat;
758 
759   ptype   = product->type;
760   switch (ptype) {
761     case MATPRODUCT_AB:  transA = false; transB = false; break;
762     case MATPRODUCT_AtB: transA = true;  transB = false; break;
763     case MATPRODUCT_ABt: transA = false; transB = true;  break;
764     default:
765       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
766   }
767 
768   product->data = pdata = new MatProductData_SeqAIJKokkos();
769   pdata->kh.set_team_work_size(16);
770   pdata->kh.set_dynamic_scheduling(true);
771   pdata->reusesym = product->api_user;
772 
773   /* TODO: add command line options to select spgemm algorithms */
774   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
775   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
776      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
777    */
778   pdata->kh.create_spgemm_handle(spgemm_alg);
779 
780   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
781   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
782   if (transA) {
783     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
784     transA = false;
785   }
786 
787   if (transB) {
788     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
789     transB = false;
790   }
791 
792   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
793   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
794     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
795     calling new Mat_SeqAIJKokkos().
796     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
797   */
798   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
799   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
800   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
801 
802   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
803   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
804   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
805   PetscFunctionReturn(0);
806 }
807 
808 /* handles sparse matrix matrix ops */
809 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
810 {
811   PetscErrorCode ierr;
812   Mat_Product    *product = mat->product;
813   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
814 
815   PetscFunctionBegin;
816   MatCheckProduct(mat,1);
817   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
818   if (product->type == MATPRODUCT_ABC) {
819     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
820   }
821   if (Biskok && Ciskok) {
822     switch (product->type) {
823     case MATPRODUCT_AB:
824     case MATPRODUCT_AtB:
825     case MATPRODUCT_ABt:
826       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
827       break;
828     case MATPRODUCT_PtAP:
829     case MATPRODUCT_RARt:
830     case MATPRODUCT_ABC:
831       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
832       break;
833     default:
834       break;
835     }
836   } else { /* fallback for AIJ */
837     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
838   }
839   PetscFunctionReturn(0);
840 }
841 
842 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
843 {
844   PetscErrorCode   ierr;
845   Mat_SeqAIJKokkos *aijkok;
846 
847   PetscFunctionBegin;
848   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
849   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
850   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
851   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
852   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
853   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
854   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
855   PetscFunctionReturn(0);
856 }
857 
858 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
859 {
860   PetscErrorCode   ierr;
861   Mat_SeqAIJKokkos *aijkok;
862 
863   PetscFunctionBegin;
864   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
865   if (aijkok) { /* Only zero the device if data is already there */
866     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
867     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
868   } else { /* Might be preallocated but not assembled */
869     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
870   }
871   PetscFunctionReturn(0);
872 }
873 
874 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
875 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
876 {
877   PetscErrorCode     ierr;
878   Mat_SeqAIJKokkos   *aijkok;
879 
880   PetscFunctionBegin;
881   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
882   PetscValidPointer(kv,2);
883   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
884   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
885   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
886   *kv    = aijkok->a_dual.view_device();
887   PetscFunctionReturn(0);
888 }
889 
890 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
891 {
892   PetscFunctionBegin;
893   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
894   PetscValidPointer(kv,2);
895   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
896   PetscFunctionReturn(0);
897 }
898 
899 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
900 {
901   PetscErrorCode     ierr;
902   Mat_SeqAIJKokkos   *aijkok;
903 
904   PetscFunctionBegin;
905   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
906   PetscValidPointer(kv,2);
907   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
908   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
909   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
910   *kv    = aijkok->a_dual.view_device();
911   PetscFunctionReturn(0);
912 }
913 
914 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
915 {
916   PetscErrorCode     ierr;
917 
918   PetscFunctionBegin;
919   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
920   PetscValidPointer(kv,2);
921   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
922   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
923   PetscFunctionReturn(0);
924 }
925 
926 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
927 {
928   Mat_SeqAIJKokkos   *aijkok;
929 
930   PetscFunctionBegin;
931   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
932   PetscValidPointer(kv,2);
933   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
934   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
935   *kv    = aijkok->a_dual.view_device();
936   PetscFunctionReturn(0);
937 }
938 
939 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
940 {
941   PetscErrorCode     ierr;
942 
943   PetscFunctionBegin;
944   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
945   PetscValidPointer(kv,2);
946   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
947   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
948   PetscFunctionReturn(0);
949 }
950 
951 /* Computes Y += alpha X */
952 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
953 {
954   PetscErrorCode             ierr;
955   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
956   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
957   ConstMatScalarKokkosView   Xa;
958   MatScalarKokkosView        Ya;
959 
960   PetscFunctionBegin;
961   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
962   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
963   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
964   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
965   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
966 
967   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
968     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
969     PetscBool e;
970     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
971     if (e) {
972       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
973       if (e) pattern = SAME_NONZERO_PATTERN;
974     }
975   }
976 
977   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
978     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
979     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
980     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
981   */
982   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
983   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
984   Xa   = xkok->a_dual.view_device();
985   Ya   = ykok->a_dual.view_device();
986 
987   if (pattern == SAME_NONZERO_PATTERN) {
988     KokkosBlas::axpy(alpha,Xa,Ya);
989     ierr = MatSeqAIJKokkosModifyDevice(Y);
990   } else if (pattern == SUBSET_NONZERO_PATTERN) {
991     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
992     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
993 
994     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
995       PetscInt i = t.league_rank(); /* row i */
996       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
997         PetscInt p,q = Yi(i);
998         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
999           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1000           if (Xj(p) == Yj(q)) { /* Found it */
1001             Ya(q) += alpha * Xa(p);
1002             q++;
1003           } else {
1004             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1005                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1006             */
1007             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1008           }
1009         }
1010       });
1011     });
1012     ierr = MatSeqAIJKokkosModifyDevice(Y);
1013   } else { /* different nonzero patterns */
1014     Mat             Z;
1015     KokkosCsrMatrix zcsr;
1016     KernelHandle    kh;
1017     kh.create_spadd_handle(false);
1018     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1019     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1020     zkok = new Mat_SeqAIJKokkos(zcsr);
1021     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1022     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1023     kh.destroy_spadd_handle();
1024   }
1025   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1026   ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */
1027   PetscFunctionReturn(0);
1028 }
1029 
1030 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
1031 {
1032   PetscErrorCode            ierr;
1033   Mat                       newmat;
1034   Mat_SeqAIJKokkos          *akok;
1035   Mat_SeqAIJ                *aseq;
1036 
1037   PetscFunctionBegin;
1038   ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr);
1039   ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr);
1040   ierr = MatSetType(newmat,MATSEQAIJ);CHKERRQ(ierr);
1041   ierr = MatSetPreallocationCOO_SeqAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
1042   ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
1043   ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
1044   ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
1045   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
1046   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1047   akok->SetUpCOO(aseq);
1048   PetscFunctionReturn(0);
1049 }
1050 
1051 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1052 {
1053   PetscErrorCode              ierr;
1054   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1055   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1056   PetscCount                  Annz = aseq->nz;
1057   const PetscCountKokkosView& jmap = akok->jmap_d;
1058   const PetscCountKokkosView& perm = akok->perm_d;
1059   MatScalarKokkosView         Aa;
1060   ConstMatScalarKokkosView    kv;
1061   PetscMemType                memtype;
1062 
1063   PetscFunctionBegin;
1064   PetscAssert(A->assembled,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Expected matrix to be already assembled in MatSetPreallocationCOO()");
1065   ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
1066   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1067     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
1068   } else {
1069     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
1070   }
1071 
1072   if (imode == INSERT_VALUES) {ierr = MatSeqAIJGetKokkosViewWrite(A,&Aa);CHKERRQ(ierr);} /* write matrix values */
1073   else {ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr);} /* read & write matrix values */
1074 
1075   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {
1076     PetscScalar sum = 0.0;
1077     for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k));
1078     Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum;
1079   });
1080 
1081   if (imode == INSERT_VALUES) {ierr = MatSeqAIJRestoreKokkosViewWrite(A,&Aa);CHKERRQ(ierr);}
1082   else {ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);}
1083   PetscFunctionReturn(0);
1084 }
1085 
1086 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1087 {
1088   PetscErrorCode ierr;
1089 
1090   PetscFunctionBegin;
1091   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
1092   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
1093   B->offloadmask = PETSC_OFFLOAD_CPU;
1094   PetscFunctionReturn(0);
1095 }
1096 
1097 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1098 {
1099   PetscErrorCode     ierr;
1100   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1101 
1102   PetscFunctionBegin;
1103   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1104   A->boundtocpu  = PETSC_FALSE;
1105 
1106   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1107   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1108   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1109   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1110   A->ops->scale                     = MatScale_SeqAIJKokkos;
1111   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1112   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1113   A->ops->mult                      = MatMult_SeqAIJKokkos;
1114   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1115   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1116   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1117   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1118   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1119   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1120   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1121   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1122   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1123   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1124   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1125   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1126   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1127   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1128 
1129   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr);
1130   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr);
1131   PetscFunctionReturn(0);
1132 }
1133 
1134 PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1135 {
1136   PetscErrorCode ierr;
1137   Mat_SeqAIJ     *aseq;
1138   PetscInt       i,m,n;
1139 
1140   PetscFunctionBegin;
1141   PetscCheckFalse(A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1142 
1143   m    = akok->nrows();
1144   n    = akok->ncols();
1145   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1146   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1147 
1148   /* Set up data structures of A as a MATSEQAIJ */
1149   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1150   aseq = (Mat_SeqAIJ*)(A)->data;
1151 
1152   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1153   akok->j_dual.sync_host();
1154 
1155   aseq->i            = akok->i_host_data();
1156   aseq->j            = akok->j_host_data();
1157   aseq->a            = akok->a_host_data();
1158   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1159   aseq->singlemalloc = PETSC_FALSE;
1160   aseq->free_a       = PETSC_FALSE;
1161   aseq->free_ij      = PETSC_FALSE;
1162   aseq->nz           = akok->nnz();
1163   aseq->maxnz        = aseq->nz;
1164 
1165   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1166   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1167   for (i=0; i<m; i++) {
1168     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1169   }
1170 
1171   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1172   akok->nonzerostate = A->nonzerostate;
1173   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1174   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1175   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1176   PetscFunctionReturn(0);
1177 }
1178 
1179 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1180 
1181    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1182  */
1183 PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1184 {
1185   PetscErrorCode ierr;
1186 
1187   PetscFunctionBegin;
1188   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1189   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
1190   PetscFunctionReturn(0);
1191 }
1192 
1193 /* --------------------------------------------------------------------------------*/
1194 /*@C
1195    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1196    (the default parallel PETSc format). This matrix will ultimately be handled by
1197    Kokkos for calculations. For good matrix
1198    assembly performance the user should preallocate the matrix storage by setting
1199    the parameter nz (or the array nnz).  By setting these parameters accurately,
1200    performance during matrix assembly can be increased by more than a factor of 50.
1201 
1202    Collective
1203 
1204    Input Parameters:
1205 +  comm - MPI communicator, set to PETSC_COMM_SELF
1206 .  m - number of rows
1207 .  n - number of columns
1208 .  nz - number of nonzeros per row (same for all rows)
1209 -  nnz - array containing the number of nonzeros in the various rows
1210          (possibly different for each row) or NULL
1211 
1212    Output Parameter:
1213 .  A - the matrix
1214 
1215    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1216    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1217    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1218 
1219    Notes:
1220    If nnz is given then nz is ignored
1221 
1222    The AIJ format (also called the Yale sparse matrix format or
1223    compressed row storage), is fully compatible with standard Fortran 77
1224    storage.  That is, the stored row and column indices can begin at
1225    either one (as in Fortran) or zero.  See the users' manual for details.
1226 
1227    Specify the preallocated storage with either nz or nnz (not both).
1228    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1229    allocation.  For large problems you MUST preallocate memory or you
1230    will get TERRIBLE performance, see the users' manual chapter on matrices.
1231 
1232    By default, this format uses inodes (identical nodes) when possible, to
1233    improve numerical efficiency of matrix-vector products and solves. We
1234    search for consecutive rows with the same nonzero structure, thereby
1235    reusing matrix information to achieve increased efficiency.
1236 
1237    Level: intermediate
1238 
1239 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
1240 @*/
1241 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1242 {
1243   PetscErrorCode ierr;
1244 
1245   PetscFunctionBegin;
1246   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1247   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1248   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1249   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1250   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1251   PetscFunctionReturn(0);
1252 }
1253 
1254 typedef Kokkos::TeamPolicy<>::member_type team_member;
1255 //
1256 // This factorization exploits block diagonal matrices with "Nf" (not used).
1257 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
1258 //
1259 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1260 {
1261   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
1262   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1263   IS                 isrow = b->row,isicol = b->icol;
1264   PetscErrorCode     ierr;
1265   const PetscInt     *r_h,*ic_h;
1266   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1267   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1268   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
1269   PetscBool          row_identity,col_identity;
1270   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1271 
1272   PetscFunctionBegin;
1273   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1274   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
1275   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1276   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
1277   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
1278   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
1279   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1280   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1281   {
1282 #define KOKKOS_SHARED_LEVEL 1
1283     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
1284     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
1285     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
1286     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
1287     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
1288     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
1289     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
1290     size_t flops_h = 0.0;
1291     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
1292     Kokkos::View<size_t> d_flops_k ("flops");
1293     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
1294     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
1295     Kokkos::deep_copy (d_flops_k, h_flops_k);
1296     Kokkos::deep_copy (d_r_k, h_r_k);
1297     Kokkos::deep_copy (d_ic_k, h_ic_k);
1298     // Fill A --> fact
1299     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1300         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
1301         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
1302         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
1303         // zero rows of B
1304         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1305             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
1306             PetscScalar *baL = ba_d + bi_d[rowb];
1307             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
1308             /* zero (unfactored row) */
1309             for (int j=0;j<nzbL;j++) baL[j] = 0;
1310             for (int j=0;j<nzbU;j++) baU[j] = 0;
1311           });
1312         // copy A into B
1313         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
1314             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
1315             const PetscScalar *av    = aa_d + ai_d[rowa];
1316             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
1317             /* load in initial (unfactored row) */
1318             for (int j=0;j<nza;j++) {
1319               PetscInt    colb = ic[ajtmp[j]];
1320               PetscScalar vala = av[j];
1321               if (colb == rowb) {
1322                 *(ba_d + bdiag_d[rowb]) = vala;
1323               } else {
1324                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1325                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
1326                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1327                 for (int j=0; j<nz ; j++) {
1328                   if (pbj[j] == colb) {
1329                     pba[j] = vala;
1330                     set++;
1331                     break;
1332                   }
1333                 }
1334                #if !defined(PETSC_HAVE_SYCL)
1335                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1336                #endif
1337               }
1338             }
1339           });
1340       });
1341     Kokkos::fence();
1342 
1343     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1344         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1345         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1346         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1347         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1348         const PetscInt  start = field*nloc, end = start + nloc;
1349         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1350         // A22 panel update for each row A(1,:) and col A(:,1)
1351         for (int ii=start; ii<end-1; ii++) {
1352           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1353           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1354           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1355           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1356           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1357               PetscInt kIdx = j*Ni + field_block_idx;
1358               if (kIdx >= nzUi) /* void */ ;
1359               else {
1360                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1361                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1362                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1363                 size_t         st_idx;
1364                 // find and do L(k,i) = A(:k,i) / A(i,i)
1365                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1366                 // get column, there has got to be a better way
1367                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1368                     if (pjL[j] == ii) {
1369                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1370                       idx = j; // output
1371                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1372                     }
1373                 }, st_idx);
1374                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1375 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1376                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
1377 #endif
1378                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1379                 // U(i+1,:end)
1380                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1381                       PetscScalar Uij = baUi[uiIdx];
1382                       PetscInt    col = bjUi[uiIdx];
1383                       if (col==myk) {
1384                         // A_kk = A_kk - L_ki * U_ij(k)
1385                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1386                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1387                       } else {
1388                         PetscScalar    *start, *end, *pAkjv=NULL;
1389                         PetscInt       high, low;
1390                         const PetscInt *startj;
1391                         if (col<myk) { // L
1392                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1393                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1394                           start = pLki+1; // start at pLki+1, A22(myk,1)
1395                           startj= bj_d + bi_d[myk] + idx;
1396                           end   = ba_d + bi_d[myk+1];
1397                         } else {
1398                           PetscInt idx = bdiag_d[myk+1]+1;
1399                           start = ba_d + idx;
1400                           startj= bj_d + idx;
1401                           end   = ba_d + bdiag_d[myk];
1402                         }
1403                         // search for 'col', use bisection search - TODO
1404                         low  = 0;
1405                         high = (PetscInt)(end-start);
1406                         while (high-low > 5) {
1407                           int t = (low+high)/2;
1408                           if (startj[t] > col) high = t;
1409                           else                 low  = t;
1410                         }
1411                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1412                           if (startj[pAkjv-start] == col) break;
1413                         }
1414 #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
1415                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
1416 #endif
1417                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1418                       }
1419                     });
1420               }
1421             });
1422           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1423           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1424         } /* endof for (i=0; i<n; i++) { */
1425         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1426       });
1427     Kokkos::fence();
1428     Kokkos::deep_copy (h_flops_k, d_flops_k);
1429     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1430     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1431         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1432         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1433         /* Invert diagonal for simpler triangular solves */
1434         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1435             int i = start + outer_index*Ni + lg_rank%Ni;
1436             if (i < end) {
1437               PetscScalar *pv = ba_d + bdiag_d[i];
1438               *pv = 1.0/(*pv);
1439             }
1440           });
1441       });
1442   }
1443   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1444   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
1445   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
1446 
1447   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1448   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
1449   if (b->inode.size) {
1450     B->ops->solve = MatSolve_SeqAIJ_Inode;
1451   } else if (row_identity && col_identity) {
1452     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1453   } else {
1454     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1455   }
1456   B->offloadmask = PETSC_OFFLOAD_GPU;
1457   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
1458   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1459   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1460   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1461   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1462   B->assembled              = PETSC_TRUE;
1463   B->preallocated           = PETSC_TRUE;
1464 
1465   PetscFunctionReturn(0);
1466 }
1467 
1468 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1469 {
1470   PetscErrorCode   ierr;
1471 
1472   PetscFunctionBegin;
1473   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1474   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1475   PetscFunctionReturn(0);
1476 }
1477 
1478 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1479 {
1480   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1481 
1482   PetscFunctionBegin;
1483   if (!factors->sptrsv_symbolic_completed) {
1484     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1485     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1486     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1487   }
1488   PetscFunctionReturn(0);
1489 }
1490 
1491 /* Check if we need to update factors etc for transpose solve */
1492 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1493 {
1494   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1495   MatColIdxType             n = A->rmap->n;
1496 
1497   PetscFunctionBegin;
1498   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1499     /* Update L^T and do sptrsv symbolic */
1500     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
1501     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1502     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1503     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1504 
1505     KokkosKernels::Impl::transpose_matrix<
1506       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1507       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1508       MatRowMapKokkosView,DefaultExecutionSpace>(
1509         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1510         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1511 
1512     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1513       We have to sort the indices, until KK provides finer control options.
1514     */
1515     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1516       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1517         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1518 
1519     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1520 
1521     /* Update U^T and do sptrsv symbolic */
1522     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
1523     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1524     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1525     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1526 
1527     KokkosKernels::Impl::transpose_matrix<
1528       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1529       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1530       MatRowMapKokkosView,DefaultExecutionSpace>(
1531         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1532         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1533 
1534     /* Sort indices. See comments above */
1535     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1536       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
1537         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1538 
1539     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1540     factors->transpose_updated = PETSC_TRUE;
1541   }
1542   PetscFunctionReturn(0);
1543 }
1544 
1545 /* Solve Ax = b, with A = LU */
1546 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1547 {
1548   PetscErrorCode                 ierr;
1549   ConstPetscScalarKokkosView     bv;
1550   PetscScalarKokkosView          xv;
1551   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1552 
1553   PetscFunctionBegin;
1554   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1555   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
1556   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1557   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1558   /* Solve L tmpv = b */
1559   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1560   /* Solve Ux = tmpv */
1561   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1562   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1563   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1564   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1565   PetscFunctionReturn(0);
1566 }
1567 
1568 /* Solve A^T x = b, where A^T = U^T L^T */
1569 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1570 {
1571   PetscErrorCode                 ierr;
1572   ConstPetscScalarKokkosView     bv;
1573   PetscScalarKokkosView          xv;
1574   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1575 
1576   PetscFunctionBegin;
1577   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1578   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
1579   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1580   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1581   /* Solve U^T tmpv = b */
1582   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1583 
1584   /* Solve L^T x = tmpv */
1585   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1586   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1587   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1588   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1589   PetscFunctionReturn(0);
1590 }
1591 
1592 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1593 {
1594   PetscErrorCode                 ierr;
1595   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1596   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1597   PetscInt                       fill_lev = info->levels;
1598 
1599   PetscFunctionBegin;
1600   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1601   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1602 
1603   auto a_d = aijkok->a_dual.view_device();
1604   auto i_d = aijkok->i_dual.view_device();
1605   auto j_d = aijkok->j_dual.view_device();
1606 
1607   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1608 
1609   B->assembled                       = PETSC_TRUE;
1610   B->preallocated                    = PETSC_TRUE;
1611   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1612   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1613   B->ops->matsolve                   = NULL;
1614   B->ops->matsolvetranspose          = NULL;
1615   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1616 
1617   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1618   factors->transpose_updated         = PETSC_FALSE;
1619   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1620   /* TODO: log flops, but how to know that? */
1621   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1622   PetscFunctionReturn(0);
1623 }
1624 
1625 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1626 {
1627   PetscErrorCode                 ierr;
1628   Mat_SeqAIJKokkos               *aijkok;
1629   Mat_SeqAIJ                     *b;
1630   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1631   PetscInt                       fill_lev = info->levels;
1632   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1633   PetscInt                       n = A->rmap->n;
1634 
1635   PetscFunctionBegin;
1636   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1637   /* Rebuild factors */
1638   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1639   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1640 
1641   /* Create a spiluk handle and then do symbolic factorization */
1642   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1643   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1644 
1645   auto spiluk_handle = factors->kh.get_spiluk_handle();
1646 
1647   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1648   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1649   Kokkos::realloc(factors->iU_d,n+1);
1650   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1651 
1652   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1653   auto i_d = aijkok->i_dual.view_device();
1654   auto j_d = aijkok->j_dual.view_device();
1655   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1656   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1657 
1658   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1659   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1660   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1661   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1662 
1663   /* TODO: add options to select sptrsv algorithms */
1664   /* Create sptrsv handles for L, U and their transpose */
1665  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1666   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1667  #else
1668   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1669  #endif
1670 
1671   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1672   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1673   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1674   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1675 
1676   /* Fill fields of the factor matrix B */
1677   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1678   b     = (Mat_SeqAIJ*)B->data;
1679   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1680   B->info.fill_ratio_given  = info->fill;
1681   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1682 
1683   B->offloadmask            = PETSC_OFFLOAD_GPU;
1684   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1685   PetscFunctionReturn(0);
1686 }
1687 
1688 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1689 {
1690   PetscErrorCode   ierr;
1691   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1692   const PetscInt   nrows   = A->rmap->n;
1693 
1694   PetscFunctionBegin;
1695   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1696   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1697   // move B data into Kokkos
1698   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
1699   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
1700   {
1701     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1702     if (!baijkok->diag_d.extent(0)) {
1703       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1704       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1705       Kokkos::deep_copy (baijkok->diag_d, h_diag);
1706     }
1707   }
1708   PetscFunctionReturn(0);
1709 }
1710 
1711 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1712 {
1713   PetscFunctionBegin;
1714   *type = MATSOLVERKOKKOS;
1715   PetscFunctionReturn(0);
1716 }
1717 
1718 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1719 {
1720   PetscFunctionBegin;
1721   *type = MATSOLVERKOKKOSDEVICE;
1722   PetscFunctionReturn(0);
1723 }
1724 
1725 /*MC
1726   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1727   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1728 
1729   Level: beginner
1730 
1731 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1732 M*/
1733 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1734 {
1735   PetscErrorCode ierr;
1736   PetscInt       n = A->rmap->n;
1737 
1738   PetscFunctionBegin;
1739   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1740   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1741   (*B)->factortype = ftype;
1742   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1743   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1744 
1745   if (ftype == MAT_FACTOR_LU) {
1746     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1747     (*B)->canuseordering         = PETSC_TRUE;
1748     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1749   } else if (ftype == MAT_FACTOR_ILU) {
1750     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1751     (*B)->canuseordering         = PETSC_FALSE;
1752     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1753   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1754 
1755   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1756   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1757   PetscFunctionReturn(0);
1758 }
1759 
1760 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1761 {
1762   PetscErrorCode ierr;
1763   PetscInt       n = A->rmap->n;
1764 
1765   PetscFunctionBegin;
1766   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1767   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1768   (*B)->factortype = ftype;
1769   (*B)->canuseordering = PETSC_TRUE;
1770   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1771   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1772 
1773   if (ftype == MAT_FACTOR_LU) {
1774     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1775     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1776   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1777 
1778   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1779   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1780   PetscFunctionReturn(0);
1781 }
1782 
1783 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1784 {
1785   PetscErrorCode ierr;
1786 
1787   PetscFunctionBegin;
1788   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1789   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1790   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1791   PetscFunctionReturn(0);
1792 }
1793 
1794 /* Utility to print out a KokkosCsrMatrix for debugging */
1795 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1796 {
1797   PetscErrorCode    ierr;
1798   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1799   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1800   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1801   const PetscInt    *i = iv.data();
1802   const PetscInt    *j = jv.data();
1803   const PetscScalar *a = av.data();
1804   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1805 
1806   PetscFunctionBegin;
1807   ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1808   for (PetscInt k=0; k<m; k++) {
1809     ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr);
1810     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1811       ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1812     }
1813     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1814   }
1815   PetscFunctionReturn(0);
1816 }
1817