xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 7d5fd1e4d9337468ad3f05b65b7facdcd2dfd2a4)
1 #include <petscvec_kokkos.hpp>
2 #include <petsc/private/petscimpl.h>
3 #include <petscsystypes.h>
4 #include <petscerror.h>
5 
6 #include <Kokkos_Core.hpp>
7 #include <KokkosBlas.hpp>
8 #include <KokkosSparse_CrsMatrix.hpp>
9 #include <KokkosSparse_spmv.hpp>
10 #include <KokkosSparse_spiluk.hpp>
11 #include <KokkosSparse_sptrsv.hpp>
12 
13 #include <../src/mat/impls/aij/seq/aij.h>
14 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
15 
16 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
17 
18 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
19 {
20   PetscErrorCode    ierr;
21   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
22 
23   PetscFunctionBegin;
24   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
25   A->offloadmask = PETSC_OFFLOAD_CPU;
26   if (aijkok && aijkok->device_mat_d.data()) {
27     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
28   }
29   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
30   PetscFunctionReturn(0);
31 }
32 
33 /* Sync CSR data to device if not yet */
34 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
35 {
36   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38   PetscInt                  nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
39 
40   PetscFunctionBegin;
41   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
42   /* If aijkok is not built yet OR the nonzero pattern on CPU has changed, build aijkok from the scratch */
43   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
44     delete aijkok;
45     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
46     aijkok->nonzerostate = A->nonzerostate;
47     A->spptr             = aijkok;
48   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
49     aijkok->a_dual.clear_sync_state();
50     aijkok->a_dual.modify_host(); /* Mark host is modified */
51     aijkok->a_dual.sync_device(); /* Sync the device */
52     aijkok->transpose_updated = PETSC_FALSE;
53     aijkok->hermitian_updated = PETSC_FALSE;
54   }
55   A->offloadmask = PETSC_OFFLOAD_BOTH;
56   PetscFunctionReturn(0);
57 }
58 
59 /* Mark the CSR data on device is modified */
60 static PetscErrorCode MatSeqAIJKokkosSetDeviceModified(Mat A)
61 {
62   PetscErrorCode   ierr;
63   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
64 
65   PetscFunctionBegin;
66   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
67   aijkok->a_dual.clear_sync_state();
68   aijkok->a_dual.modify_device();
69   aijkok->transpose_updated = PETSC_FALSE;
70   aijkok->hermitian_updated = PETSC_FALSE;
71   A->offloadmask = PETSC_OFFLOAD_GPU;
72   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
73   PetscFunctionReturn(0);
74 }
75 
76 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
77 {
78   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
79 
80   PetscFunctionBegin;
81   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
82    /* We do not expect one needs factors on host  */
83   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
84   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
85     if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
86     aijkok->a_dual.clear_sync_state();
87     aijkok->a_dual.modify_device(); /* Mark device is modified */
88     aijkok->a_dual.sync_host(); /* Sync the host */
89     A->offloadmask = PETSC_OFFLOAD_BOTH;
90   }
91   PetscFunctionReturn(0);
92 }
93 
94 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
95 {
96   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
97   PetscErrorCode ierr;
98 
99   PetscFunctionBegin;
100   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
101   *array = a->a;
102   A->offloadmask = PETSC_OFFLOAD_CPU;
103   PetscFunctionReturn(0);
104 }
105 
106 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
107 PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
108 {
109   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
110   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
111 
112   PetscFunctionBegin;
113   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos");
114   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
115   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
116   PetscFunctionReturn(0);
117 }
118 
119 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
120 PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
121 {
122   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
123 
124   PetscFunctionBegin;
125   if (aijkok && aijkok->device_mat_d.data()) {
126     *d_mat = aijkok->device_mat_d.data();
127   } else {
128     PetscErrorCode   ierr;
129     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
130     *d_mat  = NULL;
131   }
132   PetscFunctionReturn(0);
133 }
134 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose(Mat A)
135 {
136   PetscErrorCode                   ierr;
137   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
138 
139   PetscFunctionBegin;
140   if (!aijkok->At) { /* Generate At for the first time */
141     ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->At);CHKERRQ(ierr);
142     ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr);
143   } else if (!aijkok->transpose_updated) { /* Only update At values */
144     ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->At);CHKERRQ(ierr);
145     ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr);
146   }
147   aijkok->transpose_updated = PETSC_TRUE;
148   PetscFunctionReturn(0);
149 }
150 
151 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian(Mat A)
152 {
153   PetscErrorCode                   ierr;
154   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
155 
156   PetscFunctionBegin;
157   if (!aijkok->Ah) { /* Generate Ah for the first time */
158     ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->Ah);CHKERRQ(ierr);
159     ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr);
160     ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr);
161   } else if (!aijkok->hermitian_updated) { /* Only update Ah values */
162     ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->Ah);CHKERRQ(ierr);
163     ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr);
164     ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr);
165   }
166   aijkok->hermitian_updated = PETSC_TRUE;
167   PetscFunctionReturn(0);
168 }
169 
170 /* y = A x */
171 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
172 {
173   PetscErrorCode                   ierr;
174   Mat_SeqAIJKokkos                 *aijkok;
175   ConstPetscScalarKokkosView       xv;
176   PetscScalarKokkosView            yv;
177 
178   PetscFunctionBegin;
179   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
180   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
181   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
182   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
183   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
184   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
185   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
186   ierr   = WaitForKokkos();CHKERRQ(ierr);
187   /* 2.0*aijkok->csrmat.nnz()-aijkok->csrmat.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */
188   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
189   PetscFunctionReturn(0);
190 }
191 
192 /* y = A^T x */
193 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
194 {
195   PetscErrorCode                   ierr;
196   Mat_SeqAIJKokkos                 *aijkok;
197   Mat                              B;
198   const char                       *mode;
199   ConstPetscScalarKokkosView       xv;
200   PetscScalarKokkosView            yv;
201 
202   PetscFunctionBegin;
203   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
204   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
205   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
206   if (A->form_explicit_transpose) {
207     ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr);
208     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At;
209     mode = "N";
210   } else {
211     B    = A;
212     mode = "T";
213   }
214   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
215   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
216   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
217   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
218   ierr = WaitForKokkos();CHKERRQ(ierr);
219   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
220   PetscFunctionReturn(0);
221 }
222 
223 /* y = A^H x */
224 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
225 {
226   PetscErrorCode                   ierr;
227   Mat_SeqAIJKokkos                 *aijkok;
228   Mat                              B;
229   const char                       *mode;
230   ConstPetscScalarKokkosView       xv;
231   PetscScalarKokkosView            yv;
232 
233   PetscFunctionBegin;
234   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
235   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
236   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
237   if (A->form_explicit_transpose) {
238     ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr);
239     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah;
240     mode = "N";
241   } else {
242     B    = A;
243     mode = "C";
244   }
245   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
246   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
247   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
248   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
249   ierr = WaitForKokkos();CHKERRQ(ierr);
250   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
251   PetscFunctionReturn(0);
252 }
253 
254 /* z = A x + y */
255 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
256 {
257   PetscErrorCode                   ierr;
258   Mat_SeqAIJKokkos                 *aijkok;
259   ConstPetscScalarKokkosView       xv,yv;
260   PetscScalarKokkosView            zv;
261 
262   PetscFunctionBegin;
263   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
264   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
265   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
266   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
267   if (zz != yy) Kokkos::deep_copy(zv,yv);
268   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
269   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
270   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
271   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
272   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
273   ierr = WaitForKokkos();CHKERRQ(ierr);
274   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
275   PetscFunctionReturn(0);
276 }
277 
278 /* z = A^T x + y */
279 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
280 {
281   PetscErrorCode                   ierr;
282   Mat_SeqAIJKokkos                 *aijkok;
283   Mat                              B;
284   const char                       *mode;
285   ConstPetscScalarKokkosView       xv,yv;
286   PetscScalarKokkosView            zv;
287 
288   PetscFunctionBegin;
289   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
290   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
291   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
292   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
293   if (zz != yy) Kokkos::deep_copy(zv,yv);
294   if (A->form_explicit_transpose) {
295     ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr);
296     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At;
297     mode = "N";
298   } else {
299     B    = A;
300     mode = "T";
301   }
302   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
303   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
304   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
305   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
306   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
307   ierr = WaitForKokkos();CHKERRQ(ierr);
308   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
309   PetscFunctionReturn(0);
310 }
311 
312 /* z = A^H x + y */
313 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
314 {
315   PetscErrorCode                   ierr;
316   Mat_SeqAIJKokkos                 *aijkok;
317   Mat                              B;
318   const char                       *mode;
319   ConstPetscScalarKokkosView       xv,yv;
320   PetscScalarKokkosView            zv;
321 
322   PetscFunctionBegin;
323   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
324   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
325   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
326   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
327   if (zz != yy) Kokkos::deep_copy(zv,yv);
328   if (A->form_explicit_transpose) {
329     ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr);
330     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah;
331     mode = "N";
332   } else {
333     B    = A;
334     mode = "C";
335   }
336   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
337   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
338   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
339   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
340   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
341   ierr = WaitForKokkos();CHKERRQ(ierr);
342   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
343   PetscFunctionReturn(0);
344 }
345 
346 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
347 {
348   PetscErrorCode            ierr;
349   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
350 
351   PetscFunctionBegin;
352   switch (op) {
353     case MAT_FORM_EXPLICIT_TRANSPOSE:
354       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
355       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
356       A->form_explicit_transpose = flg;
357       break;
358     default:
359       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
360       break;
361   }
362   PetscFunctionReturn(0);
363 }
364 
365 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
366 {
367   PetscErrorCode ierr;
368   Mat            B;
369   Mat_SeqAIJ     *aij;
370 
371   PetscFunctionBegin;
372   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
373   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
374     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
375   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
376     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
377   }
378 
379   B    = *newmat;
380   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
381   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
382 
383   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
384   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
385   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr);
386   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
387   aij  = (Mat_SeqAIJ*)B->data;
388   aij->inode.use = PETSC_FALSE;
389 
390   PetscFunctionReturn(0);
391 }
392 
393 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
394 {
395   PetscErrorCode ierr;
396 
397   PetscFunctionBegin;
398   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
399   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
400   PetscFunctionReturn(0);
401 }
402 
403 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
404 {
405   PetscErrorCode             ierr;
406   Mat_SeqAIJKokkos           *aijkok;
407 
408   PetscFunctionBegin;
409   if (A->factortype == MAT_FACTOR_NONE) {
410     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
411     if (aijkok) {
412       if (aijkok->device_mat_d.data()) {
413         delete aijkok->colmap_d;
414         delete aijkok->i_uncompressed_d;
415       }
416       if (aijkok->diag_d) delete aijkok->diag_d;
417     }
418     delete aijkok;
419   } else {
420     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
421   }
422   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr);
423   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
424   A->spptr = NULL;
425   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
426   PetscFunctionReturn(0);
427 }
428 
429 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
430 {
431   PetscErrorCode ierr;
432 
433   PetscFunctionBegin;
434   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
435   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
436   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
437   PetscFunctionReturn(0);
438 }
439 
440 #if 0
441 static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data)
442 {
443   MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data);
444 
445   PetscFunctionBegin;
446   delete kh;
447   PetscFunctionReturn(0);
448 }
449 
450 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
451 {
452   Mat_Product          *product = C->product;
453   Mat                  A,B;
454   MatProductType       ptype;
455   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
456   bool                 tA,tB;
457   PetscErrorCode       ierr;
458   MatMatKernelHandle_t *kh;
459   Mat_SeqAIJ           *c;
460 
461   PetscFunctionBegin;
462   MatCheckProduct(C,1);
463   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
464   A = product->A;
465   B = product->B;
466   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
467   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
468   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
469   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
470   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
471   kh   = static_cast<MatMatKernelHandle_t*>(C->product->data);
472   ptype = product->type;
473   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
474   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
475   switch (ptype) {
476   case MATPRODUCT_AB:
477     tA = false;
478     tB = false;
479     break;
480   case MATPRODUCT_AtB:
481     tA = true;
482     tB = false;
483     break;
484   case MATPRODUCT_ABt:
485     tA = false;
486     tB = true;
487     break;
488   default:
489     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
490   }
491 
492   KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr);
493   C->offloadmask = PETSC_OFFLOAD_GPU;
494   /* shorter version of MatAssemblyEnd_SeqAIJ */
495   c = (Mat_SeqAIJ*)C->data;
496   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
497   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
498   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
499   c->reallocs         = 0;
500   C->info.mallocs    += 0;
501   C->info.nz_unneeded = 0;
502   C->assembled = C->was_assembled = PETSC_TRUE;
503   C->num_ass++;
504   /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */
505   // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray()
506   C->offloadmask = PETSC_OFFLOAD_BOTH;
507   // Also, we should add support to copy back from device to host
508   PetscFunctionReturn(0);
509 }
510 
511 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
512 {
513   Mat_Product          *product = C->product;
514   Mat                  A,B;
515   MatProductType       ptype;
516   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
517   PetscInt             m,n,k;
518   bool                 tA,tB;
519   PetscErrorCode       ierr;
520   Mat_SeqAIJ           *c;
521   MatMatKernelHandle_t *kh;
522 
523   PetscFunctionBegin;
524   MatCheckProduct(C,1);
525   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
526   A = product->A;
527   B = product->B;
528   // TODO only copy the i,j data, not the values
529   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
530   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
531   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
532   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
533   ptype = product->type;
534   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
535   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
536   switch (ptype) {
537   case MATPRODUCT_AB:
538     tA = false;
539     tB = false;
540     m = A->rmap->n;
541     n = B->cmap->n;
542     break;
543   case MATPRODUCT_AtB:
544     tA = true;
545     tB = false;
546     m = A->cmap->n;
547     n = B->cmap->n;
548     break;
549   case MATPRODUCT_ABt:
550     tA = false;
551     tB = true;
552     m = A->rmap->n;
553     n = B->rmap->n;
554     break;
555   default:
556     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
557   }
558   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
559   ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr);
560   c = (Mat_SeqAIJ*)C->data;
561 
562   kh = new MatMatKernelHandle_t;
563   // TODO SZ: ADD RUNTIME SELECTION OF THESE
564   kh->set_team_work_size(16);
565   kh->set_dynamic_scheduling(true);
566   // Select an spgemm algorithm, limited by configuration at compile-time and
567   // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED,
568   // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL}
569   std::string myalg("SPGEMM_KK_MEMORY");
570   kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg));
571 
572   // TODO JZ
573   ckok = NULL; //new Mat_SeqAIJKokkos();
574   C->spptr = ckok;
575   KokkosCsrMatrix_t ccsr; // here only to have the code compile
576   KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr);
577 
578   c->singlemalloc = PETSC_FALSE;
579   c->free_a       = PETSC_TRUE;
580   c->free_ij      = PETSC_TRUE;
581   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
582   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
583   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
584 
585   // TODO JZ copy from device to c->i and c->j
586 
587   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
588   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
589   c->maxnz = c->nz;
590   c->nonzerorowcnt = 0;
591   c->rmax = 0;
592   for (k = 0; k < m; k++) {
593     const PetscInt nn = c->i[k+1] - c->i[k];
594     c->ilen[k] = c->imax[k] = nn;
595     c->nonzerorowcnt += (PetscInt)!!nn;
596     c->rmax = PetscMax(c->rmax,nn);
597   }
598 
599   C->nonzerostate++;
600   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
601   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
602   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
603   ckok->nonzerostate = C->nonzerostate;
604   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
605   C->preallocated  = PETSC_TRUE;
606   C->assembled     = PETSC_FALSE;
607   C->was_assembled = PETSC_FALSE;
608 
609   C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
610   C->product->data = kh;
611   C->product->destroy = MatMatKernelHandleDestroy_Private;
612   PetscFunctionReturn(0);
613 }
614 
615 /* handles sparse matrix matrix ops */
616 PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
617 {
618   Mat_Product    *product = mat->product;
619   PetscErrorCode ierr;
620   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
621 
622   PetscFunctionBegin;
623   MatCheckProduct(mat,1);
624   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
625   if (product->type == MATPRODUCT_ABC) {
626     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
627   }
628   if (Biskok && Ciskok) {
629     switch (product->type) {
630     case MATPRODUCT_AB:
631     case MATPRODUCT_AtB:
632     case MATPRODUCT_ABt:
633       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
634       break;
635     case MATPRODUCT_PtAP:
636     case MATPRODUCT_RARt:
637     case MATPRODUCT_ABC:
638       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
639       break;
640     default:
641       break;
642     }
643   } else { /* fallback for AIJ */
644     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
645   }
646   PetscFunctionReturn(0);
647 }
648 #endif
649 
650 static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
651 {
652   PetscErrorCode    ierr;
653   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
654   PetscFunctionBegin;
655   if (aijkok && aijkok->device_mat_d.data()) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported");
656   ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr);
657   PetscFunctionReturn(0);
658 }
659 
660 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
661 {
662   PetscErrorCode   ierr;
663   Mat_SeqAIJKokkos *aijkok;
664 
665   PetscFunctionBegin;
666   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
667   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
668   KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d);
669   A->offloadmask = PETSC_OFFLOAD_GPU;
670   ierr = WaitForKokkos();CHKERRQ(ierr);
671   ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr);
672   // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
673   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
674   PetscFunctionReturn(0);
675 }
676 
677 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
678 {
679   PetscErrorCode   ierr;
680   PetscBool        both = PETSC_FALSE;
681   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
682   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
683 
684   PetscFunctionBegin;
685   if (aijkok && aijkok->a_d.data()) {
686     KokkosBlas::fill(aijkok->a_d,0.0);
687     both = PETSC_TRUE;
688   }
689   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
690   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
691   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
692   else A->offloadmask = PETSC_OFFLOAD_CPU;
693   PetscFunctionReturn(0);
694 }
695 
696 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
697 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
698 {
699   PetscErrorCode     ierr;
700   Mat_SeqAIJKokkos   *aijkok;
701 
702   PetscFunctionBegin;
703   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
704   PetscValidPointer(kv,2);
705   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
706   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
707   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
708   *kv    = aijkok->a_d;
709   PetscFunctionReturn(0);
710 }
711 
712 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
713 {
714   PetscFunctionBegin;
715   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
716   PetscValidPointer(kv,2);
717   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
718   PetscFunctionReturn(0);
719 }
720 
721 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv)
722 {
723   PetscErrorCode     ierr;
724   Mat_SeqAIJKokkos   *aijkok;
725 
726   PetscFunctionBegin;
727   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
728   PetscValidPointer(kv,2);
729   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
730   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
731   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
732   *kv    = aijkok->a_d;
733   PetscFunctionReturn(0);
734 }
735 
736 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv)
737 {
738   PetscErrorCode     ierr;
739 
740   PetscFunctionBegin;
741   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
742   PetscValidPointer(kv,2);
743   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
744   ierr = MatSeqAIJKokkosSetDeviceModified(A);CHKERRQ(ierr);
745   PetscFunctionReturn(0);
746 }
747 
748 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
749 {
750   Mat_SeqAIJKokkos   *aijkok;
751 
752   PetscFunctionBegin;
753   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
754   PetscValidPointer(kv,2);
755   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
756   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
757   *kv    = aijkok->a_d;
758   PetscFunctionReturn(0);
759 }
760 
761 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
762 {
763   PetscErrorCode     ierr;
764 
765   PetscFunctionBegin;
766   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
767   PetscValidPointer(kv,2);
768   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
769   ierr = MatSeqAIJKokkosSetDeviceModified(A);CHKERRQ(ierr);
770   PetscFunctionReturn(0);
771 }
772 
773 /* Computes Y += a*X */
774 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str)
775 {
776   PetscErrorCode ierr;
777   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
778 
779   PetscFunctionBegin;
780   if (X->ops->axpy != Y->ops->axpy) {
781     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
782     PetscFunctionReturn(0);
783   }
784 
785   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) {
786     PetscBool e;
787     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
788     if (e) {
789       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
790       if (e) str = SAME_NONZERO_PATTERN;
791     }
792   }
793 
794   if (str != SAME_NONZERO_PATTERN) {
795     /* TODO: do MatAXPY on device */
796     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
797     PetscFunctionReturn(0);
798   } else {
799     ConstPetscScalarKokkosView xv;
800     PetscScalarKokkosView      yv;
801 
802     ierr = MatSeqAIJGetKokkosView(X,&xv);CHKERRQ(ierr);
803     ierr = MatSeqAIJGetKokkosView(Y,&yv);CHKERRQ(ierr);
804     KokkosBlas::axpy(a,xv,yv);
805     ierr = MatSeqAIJRestoreKokkosView(X,&xv);CHKERRQ(ierr);
806     ierr = MatSeqAIJRestoreKokkosView(Y,&yv);CHKERRQ(ierr);
807   }
808   PetscFunctionReturn(0);
809 }
810 
811 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
812 {
813   PetscErrorCode ierr;
814 
815   PetscFunctionBegin;
816   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
817   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
818   B->offloadmask = PETSC_OFFLOAD_CPU;
819   PetscFunctionReturn(0);
820 }
821 
822 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
823 {
824   PetscFunctionBegin;
825   A->boundtocpu = PETSC_FALSE;
826 
827   A->ops->setvalues                 = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */
828   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
829   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
830   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
831   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
832   A->ops->scale                     = MatScale_SeqAIJKokkos;
833   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
834   //A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
835   A->ops->mult                      = MatMult_SeqAIJKokkos;
836   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
837   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
838   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
839   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
840   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
841   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
842   PetscFunctionReturn(0);
843 }
844 
845 /* --------------------------------------------------------------------------------*/
846 /*@C
847    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
848    (the default parallel PETSc format). This matrix will ultimately be handled by
849    Kokkos for calculations. For good matrix
850    assembly performance the user should preallocate the matrix storage by setting
851    the parameter nz (or the array nnz).  By setting these parameters accurately,
852    performance during matrix assembly can be increased by more than a factor of 50.
853 
854    Collective
855 
856    Input Parameters:
857 +  comm - MPI communicator, set to PETSC_COMM_SELF
858 .  m - number of rows
859 .  n - number of columns
860 .  nz - number of nonzeros per row (same for all rows)
861 -  nnz - array containing the number of nonzeros in the various rows
862          (possibly different for each row) or NULL
863 
864    Output Parameter:
865 .  A - the matrix
866 
867    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
868    MatXXXXSetPreallocation() paradgm instead of this routine directly.
869    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
870 
871    Notes:
872    If nnz is given then nz is ignored
873 
874    The AIJ format (also called the Yale sparse matrix format or
875    compressed row storage), is fully compatible with standard Fortran 77
876    storage.  That is, the stored row and column indices can begin at
877    either one (as in Fortran) or zero.  See the users' manual for details.
878 
879    Specify the preallocated storage with either nz or nnz (not both).
880    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
881    allocation.  For large problems you MUST preallocate memory or you
882    will get TERRIBLE performance, see the users' manual chapter on matrices.
883 
884    By default, this format uses inodes (identical nodes) when possible, to
885    improve numerical efficiency of matrix-vector products and solves. We
886    search for consecutive rows with the same nonzero structure, thereby
887    reusing matrix information to achieve increased efficiency.
888 
889    Level: intermediate
890 
891 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
892 @*/
893 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
894 {
895   PetscErrorCode ierr;
896 
897   PetscFunctionBegin;
898   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
899   ierr = MatCreate(comm,A);CHKERRQ(ierr);
900   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
901   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
902   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
903   PetscFunctionReturn(0);
904 }
905 
906 typedef Kokkos::TeamPolicy<>::member_type team_member;
907 //
908 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
909 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
910 //
911 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
912 {
913   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
914   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
915   IS                 isrow = b->row,isicol = b->icol;
916   PetscErrorCode     ierr;
917   const PetscInt     *r_h,*ic_h;
918   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_d.data(), *aj_d=aijkok->j_d.data(), *bi_d=baijkok->i_d.data(), *bj_d=baijkok->j_d.data(), *bdiag_d = baijkok->diag_d->data();
919   const PetscScalar  *aa_d = aijkok->a_d.data();
920   PetscScalar        *ba_d = baijkok->a_d.data();
921   PetscBool          row_identity,col_identity;
922   PetscInt           nc, Nf, nVec=32; // should be a parameter
923   PetscContainer     container;
924 
925   PetscFunctionBegin;
926   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n);
927   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
928   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
929   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
930   if (container) {
931     PetscInt *pNf=NULL, nv;
932     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
933     Nf = (*pNf)%1000;
934     nv = (*pNf)/1000;
935     if (nv>0) nVec = nv;
936   } else Nf = 1;
937   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
938   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
939   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
940   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
941 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
942   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
943 #endif
944   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
945   {
946 #define KOKKOS_SHARED_LEVEL 1
947     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
948     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
949     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
950     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
951     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
952     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
953     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
954     size_t flops_h = 0.0;
955     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
956     Kokkos::View<size_t> d_flops_k ("flops");
957     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
958     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
959     Kokkos::deep_copy (d_flops_k, h_flops_k);
960     Kokkos::deep_copy (d_r_k, h_r_k);
961     Kokkos::deep_copy (d_ic_k, h_ic_k);
962     // Fill A --> fact
963     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
964         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
965         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
966         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
967         // zero rows of B
968         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
969             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
970             PetscScalar *baL = ba_d + bi_d[rowb];
971             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
972             /* zero (unfactored row) */
973             for (int j=0;j<nzbL;j++) baL[j] = 0;
974             for (int j=0;j<nzbU;j++) baU[j] = 0;
975           });
976         // copy A into B
977         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
978             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
979             const PetscScalar *av    = aa_d + ai_d[rowa];
980             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
981             /* load in initial (unfactored row) */
982             for (int j=0;j<nza;j++) {
983               PetscInt    colb = ic[ajtmp[j]];
984               PetscScalar vala = av[j];
985               if (colb == rowb) {
986                 *(ba_d + bdiag_d[rowb]) = vala;
987               } else {
988                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
989                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
990                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
991                 for (int j=0; j<nz ; j++) {
992                   if (pbj[j] == colb) {
993                     pba[j] = vala;
994                     set++;
995                     break;
996                   }
997                 }
998                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
999               }
1000             }
1001           });
1002       });
1003     Kokkos::fence();
1004 
1005     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1006         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1007         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1008         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1009         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
1010         const PetscInt  start = field*nloc, end = start + nloc;
1011         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1012         // A22 panel update for each row A(1,:) and col A(:,1)
1013         for (int ii=start; ii<end-1; ii++) {
1014           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1015           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1016           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1017           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1018           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1019               PetscInt kIdx = j*Ni + field_block_idx;
1020               if (kIdx >= nzUi) /* void */ ;
1021               else {
1022                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1023                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1024                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1025                 size_t         st_idx;
1026                 // find and do L(k,i) = A(:k,i) / A(i,i)
1027                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1028                 // get column, there has got to be a better way
1029                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1030                     //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii);
1031                     if (pjL[j] == ii) {
1032                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1033                       idx = j; // output
1034                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1035                     }
1036                   }, st_idx);
1037                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1038                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii);
1039                 else { // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1040                   // U(i+1,:end)
1041                   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1042                       PetscScalar Uij = baUi[uiIdx];
1043                       PetscInt    col = bjUi[uiIdx];
1044                       if (col==myk) {
1045                         // A_kk = A_kk - L_ki * U_ij(k)
1046                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1047                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1048                       } else {
1049                         PetscScalar    *start, *end, *pAkjv=NULL;
1050                         PetscInt       high, low;
1051                         const PetscInt *startj;
1052                         if (col<myk) { // L
1053                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1054                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1055                           start = pLki+1; // start at pLki+1, A22(myk,1)
1056                           startj= bj_d + bi_d[myk] + idx;
1057                           end   = ba_d + bi_d[myk+1];
1058                         } else {
1059                           PetscInt idx = bdiag_d[myk+1]+1;
1060                           start = ba_d + idx;
1061                           startj= bj_d + idx;
1062                           end   = ba_d + bdiag_d[myk];
1063                         }
1064                         // search for 'col', use bisection search - TODO
1065                         low  = 0;
1066                         high = (PetscInt)(end-start);
1067                         while (high-low > 5) {
1068                           int t = (low+high)/2;
1069                           if (startj[t] > col) high = t;
1070                           else                 low  = t;
1071                         }
1072                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1073                           if (startj[pAkjv-start] == col) break;
1074                         }
1075                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col);
1076                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1077                       }
1078                     });
1079                 }
1080               }
1081             });
1082           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1083           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1084         } /* endof for (i=0; i<n; i++) { */
1085         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1086       });
1087     Kokkos::fence();
1088     Kokkos::deep_copy (h_flops_k, d_flops_k);
1089 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1090     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1091 #elif  defined(PETSC_USE_LOG)
1092     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1093 #endif
1094     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1095         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1096         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1097         /* Invert diagonal for simpler triangular solves */
1098         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1099             int i = start + outer_index*Ni + lg_rank%Ni;
1100             if (i < end) {
1101               PetscScalar *pv = ba_d + bdiag_d[i];
1102               *pv = 1.0/(*pv);
1103             }
1104           });
1105       });
1106   }
1107 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1108   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1109 #endif
1110   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
1111   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
1112 
1113   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1114   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
1115   if (b->inode.size) {
1116     B->ops->solve = MatSolve_SeqAIJ_Inode;
1117   } else if (row_identity && col_identity) {
1118     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1119   } else {
1120     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1121   }
1122   B->offloadmask = PETSC_OFFLOAD_GPU;
1123   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
1124   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1125   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1126   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1127   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1128   B->assembled              = PETSC_TRUE;
1129   B->preallocated           = PETSC_TRUE;
1130 
1131   PetscFunctionReturn(0);
1132 }
1133 
1134 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1135 {
1136   PetscErrorCode   ierr;
1137 
1138   PetscFunctionBegin;
1139   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1140   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1141   PetscFunctionReturn(0);
1142 }
1143 
1144 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1145 {
1146   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1147 
1148   PetscFunctionBegin;
1149   if (!factors->sptrsv_symbolic_completed) {
1150     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1151     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1152     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1153   }
1154   PetscFunctionReturn(0);
1155 }
1156 
1157 /* Check if we need to update factors etc for transpose solve */
1158 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1159 {
1160   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1161   MatColumnIndexType             n = A->rmap->n;
1162 
1163   PetscFunctionBegin;
1164   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1165     /* Update L^T and do sptrsv symbolic */
1166     factors->iLt_d = MatRowOffsetKokkosView("factors->iLt_d",n+1);
1167     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1168     factors->jLt_d = MatColumnIndexKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1169     factors->aLt_d = MatValueKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1170 
1171     KokkosKernels::Impl::transpose_matrix<
1172       ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView,
1173       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView,
1174       MatRowOffsetKokkosView,DefaultExecutionSpace>(
1175         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1176         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1177 
1178     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1179       We have to sort the indices, until KK provides finer control options.
1180     */
1181     KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace,
1182       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>(
1183         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1184 
1185     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1186 
1187     /* Update U^T and do sptrsv symbolic */
1188     factors->iUt_d = MatRowOffsetKokkosView("factors->iUt_d",n+1);
1189     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1190     factors->jUt_d = MatColumnIndexKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1191     factors->aUt_d = MatValueKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1192 
1193     KokkosKernels::Impl::transpose_matrix<
1194       ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView,
1195       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView,
1196       MatRowOffsetKokkosView,DefaultExecutionSpace>(
1197         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1198         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1199 
1200     /* Sort indices. See comments above */
1201     KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace,
1202       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>(
1203         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1204 
1205     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1206     factors->transpose_updated = PETSC_TRUE;
1207   }
1208   PetscFunctionReturn(0);
1209 }
1210 
1211 /* Solve Ax = b, with A = LU */
1212 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1213 {
1214   PetscErrorCode                 ierr;
1215   ConstPetscScalarKokkosView     bv;
1216   PetscScalarKokkosView          xv;
1217   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1218 
1219   PetscFunctionBegin;
1220   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
1221   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1222   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1223   /* Solve L tmpv = b */
1224   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
1225   /* Solve Ux = tmpv */
1226   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1227   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1228   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1229   PetscFunctionReturn(0);
1230 }
1231 
1232 /* Solve A^T x = b, with A^T = U^T L^T */
1233 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1234 {
1235   PetscErrorCode                 ierr;
1236   ConstPetscScalarKokkosView     bv;
1237   PetscScalarKokkosView          xv;
1238   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1239 
1240   PetscFunctionBegin;
1241   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
1242   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1243   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1244   /* Solve U^T tmpv = b */
1245   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1246 
1247   /* Solve L^T x = tmpv */
1248   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1249   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1250   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1251   PetscFunctionReturn(0);
1252 }
1253 
1254 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1255 {
1256   PetscErrorCode                 ierr;
1257   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1258   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1259   PetscInt                       fill_lev = info->levels;
1260 
1261   PetscFunctionBegin;
1262   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1263   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,aijkok->a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1264 
1265   B->assembled                       = PETSC_TRUE;
1266   B->preallocated                    = PETSC_TRUE;
1267   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1268   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1269   B->ops->matsolve                   = NULL;
1270   B->ops->matsolvetranspose          = NULL;
1271   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1272 
1273   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1274   factors->transpose_updated         = PETSC_FALSE;
1275   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1276   PetscFunctionReturn(0);
1277 }
1278 
1279 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1280 {
1281   PetscErrorCode                 ierr;
1282   Mat_SeqAIJKokkos               *aijkok;
1283   Mat_SeqAIJ                     *b;
1284   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1285   PetscInt                       fill_lev = info->levels;
1286   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1287   PetscInt                       n = A->rmap->n;
1288 
1289   PetscFunctionBegin;
1290   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1291   /* Rebuild factors */
1292   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1293   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1294 
1295   /* Create a spiluk handle and then do symbolic factorization */
1296   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1297   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1298 
1299   auto spiluk_handle = factors->kh.get_spiluk_handle();
1300 
1301   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1302   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1303   Kokkos::realloc(factors->iU_d,n+1);
1304   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1305 
1306   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1307   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1308   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1309 
1310   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1311   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1312   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1313   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1314 
1315   /* TODO: add options to select sptrsv algorithms */
1316   /* Create sptrsv handles for L, U and their transpose */
1317  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1318   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1319  #else
1320   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1321  #endif
1322 
1323   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1324   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1325   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1326   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1327 
1328   /* Fill fields of the factor matrix B */
1329   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1330   b     = (Mat_SeqAIJ*)B->data;
1331   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1332   B->info.fill_ratio_given  = info->fill;
1333   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1334 
1335   B->offloadmask            = PETSC_OFFLOAD_GPU;
1336   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1337   PetscFunctionReturn(0);
1338 }
1339 
1340 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1341 {
1342   PetscErrorCode   ierr;
1343   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1344   const PetscInt   nrows   = A->rmap->n;
1345 
1346   PetscFunctionBegin;
1347   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1348   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1349   // move B data into Kokkos
1350   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
1351   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
1352   {
1353     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1354     if (!baijkok->diag_d) {
1355       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1356       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1357       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
1358     }
1359   }
1360   PetscFunctionReturn(0);
1361 }
1362 
1363 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1364 {
1365   PetscFunctionBegin;
1366   *type = MATSOLVERKOKKOS;
1367   PetscFunctionReturn(0);
1368 }
1369 
1370 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1371 {
1372   PetscFunctionBegin;
1373   *type = MATSOLVERKOKKOSDEVICE;
1374   PetscFunctionReturn(0);
1375 }
1376 
1377 /*MC
1378   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1379   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1380 
1381   Level: beginner
1382 
1383 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1384 M*/
1385 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1386 {
1387   PetscErrorCode ierr;
1388   PetscInt       n = A->rmap->n;
1389 
1390   PetscFunctionBegin;
1391   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1392   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1393   (*B)->factortype = ftype;
1394   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1395   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1396 
1397   if (ftype == MAT_FACTOR_LU) {
1398     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1399     (*B)->canuseordering         = PETSC_TRUE;
1400     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1401   } else if (ftype == MAT_FACTOR_ILU) {
1402     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1403     (*B)->canuseordering         = PETSC_FALSE;
1404     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1405   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1406 
1407   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1408   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1409   PetscFunctionReturn(0);
1410 }
1411 
1412 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1413 {
1414   PetscErrorCode ierr;
1415   PetscInt       n = A->rmap->n;
1416 
1417   PetscFunctionBegin;
1418   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1419   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1420   (*B)->factortype = ftype;
1421   (*B)->canuseordering = PETSC_TRUE;
1422   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1423   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1424 
1425   if (ftype == MAT_FACTOR_LU) {
1426     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1427     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1428   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1429 
1430   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1431   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1432   PetscFunctionReturn(0);
1433 }
1434 
1435 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1436 {
1437   PetscErrorCode ierr;
1438 
1439   PetscFunctionBegin;
1440   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1441   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1442   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1443   PetscFunctionReturn(0);
1444 }
1445 
1446