xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 86a275494187766c14dcdd7cc25ffb65d86fd341)
1 #include <petscvec_kokkos.hpp>
2 #include <petsc/private/petscimpl.h>
3 #include <petscsystypes.h>
4 #include <petscerror.h>
5 
6 #include <Kokkos_Core.hpp>
7 #include <KokkosBlas.hpp>
8 #include <KokkosSparse_CrsMatrix.hpp>
9 #include <KokkosSparse_spmv.hpp>
10 #include <KokkosSparse_spiluk.hpp>
11 #include <KokkosSparse_sptrsv.hpp>
12 
13 #include <../src/mat/impls/aij/seq/aij.h>
14 
15 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
16 #include <petscmat.h>
17 
18 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
19 
20 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
21 {
22   PetscErrorCode    ierr;
23   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
24 
25   PetscFunctionBegin;
26   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
27   A->offloadmask = PETSC_OFFLOAD_CPU;
28   if (aijkok && aijkok->device_mat_d.data()) {
29     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
30   }
31   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
32   PetscFunctionReturn(0);
33 }
34 
35 /* Sync CSR data to device if not yet */
36 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
37 {
38   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ*>(A->data);
39   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
40   PetscInt                  nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
41 
42   PetscFunctionBegin;
43   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
44   /* If aijkok is not built yet OR the nonzero pattern on CPU has changed, build aijkok from the scratch */
45   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
46     delete aijkok;
47     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
48     aijkok->nonzerostate = A->nonzerostate;
49     A->spptr             = aijkok;
50   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
51     aijkok->a_dual.clear_sync_state();
52     aijkok->a_dual.modify_host(); /* Mark host is modified */
53     aijkok->a_dual.sync_device(); /* Sync the device */
54     aijkok->transpose_updated = PETSC_FALSE;
55     aijkok->hermitian_updated = PETSC_FALSE;
56   }
57   A->offloadmask = PETSC_OFFLOAD_BOTH;
58   PetscFunctionReturn(0);
59 }
60 
61 /* Mark the CSR data on device is modified */
62 static PetscErrorCode MatSeqAIJKokkosSetDeviceModified(Mat A)
63 {
64   PetscErrorCode   ierr;
65   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
66 
67   PetscFunctionBegin;
68   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
69   aijkok->a_dual.clear_sync_state();
70   aijkok->a_dual.modify_device();
71   aijkok->transpose_updated = PETSC_FALSE;
72   aijkok->hermitian_updated = PETSC_FALSE;
73   A->offloadmask = PETSC_OFFLOAD_GPU;
74   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
75   PetscFunctionReturn(0);
76 }
77 
78 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
79 {
80   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
81 
82   PetscFunctionBegin;
83   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
84    /* We do not expect one needs factors on host  */
85   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
86   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
87     if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
88     aijkok->a_dual.clear_sync_state();
89     aijkok->a_dual.modify_device(); /* Mark device is modified */
90     aijkok->a_dual.sync_host(); /* Sync the host */
91     A->offloadmask = PETSC_OFFLOAD_BOTH;
92   }
93   PetscFunctionReturn(0);
94 }
95 
96 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
97 {
98   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
99   PetscErrorCode ierr;
100 
101   PetscFunctionBegin;
102   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
103   *array = a->a;
104   A->offloadmask = PETSC_OFFLOAD_CPU;
105   PetscFunctionReturn(0);
106 }
107 
108 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
109 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat)
110 {
111   Mat_SeqAIJKokkos *aijkok;
112   Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat);
113 
114   PetscFunctionBegin;
115   // ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
116   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
117   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos");
118   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
119   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
120   PetscFunctionReturn(0);
121 }
122 
123 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
124 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat)
125 {
126   Mat_SeqAIJKokkos *aijkok;
127 
128   PetscFunctionBegin;
129   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
130   if (aijkok && aijkok->device_mat_d.data()) {
131     *d_mat = aijkok->device_mat_d.data();
132   } else {
133     PetscErrorCode   ierr;
134     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
135     *d_mat  = NULL;
136   }
137   PetscFunctionReturn(0);
138 }
139 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose(Mat A)
140 {
141   PetscErrorCode                   ierr;
142   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
143 
144   PetscFunctionBegin;
145   if (!aijkok->At) { /* Generate At for the first time */
146     ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->At);CHKERRQ(ierr);
147     ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr);
148   } else if (!aijkok->transpose_updated) { /* Only update At values */
149     ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->At);CHKERRQ(ierr);
150     ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr);
151   }
152   aijkok->transpose_updated = PETSC_TRUE;
153   PetscFunctionReturn(0);
154 }
155 
156 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian(Mat A)
157 {
158   PetscErrorCode                   ierr;
159   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
160 
161   PetscFunctionBegin;
162   if (!aijkok->Ah) { /* Generate Ah for the first time */
163     ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->Ah);CHKERRQ(ierr);
164     ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr);
165     ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr);
166   } else if (!aijkok->hermitian_updated) { /* Only update Ah values */
167     ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->Ah);CHKERRQ(ierr);
168     ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr);
169     ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr);
170   }
171   aijkok->hermitian_updated = PETSC_TRUE;
172   PetscFunctionReturn(0);
173 }
174 
175 /* y = A x */
176 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
177 {
178   PetscErrorCode                   ierr;
179   Mat_SeqAIJKokkos                 *aijkok;
180   ConstPetscScalarKokkosView       xv;
181   PetscScalarKokkosView            yv;
182 
183   PetscFunctionBegin;
184   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
185   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
186   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
187   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
188   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
189   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
190   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
191   ierr   = WaitForKokkos();CHKERRQ(ierr);
192   /* 2.0*aijkok->csrmat.nnz()-aijkok->csrmat.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */
193   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
194   PetscFunctionReturn(0);
195 }
196 
197 /* y = A^T x */
198 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
199 {
200   PetscErrorCode                   ierr;
201   Mat_SeqAIJKokkos                 *aijkok;
202   Mat                              B;
203   const char                       *mode;
204   ConstPetscScalarKokkosView       xv;
205   PetscScalarKokkosView            yv;
206 
207   PetscFunctionBegin;
208   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
209   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
210   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
211   if (A->form_explicit_transpose) {
212     ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr);
213     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At;
214     mode = "N";
215   } else {
216     B    = A;
217     mode = "T";
218   }
219   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
220   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
221   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
222   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
223   ierr = WaitForKokkos();CHKERRQ(ierr);
224   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
225   PetscFunctionReturn(0);
226 }
227 
228 /* y = A^H x */
229 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
230 {
231   PetscErrorCode                   ierr;
232   Mat_SeqAIJKokkos                 *aijkok;
233   Mat                              B;
234   const char                       *mode;
235   ConstPetscScalarKokkosView       xv;
236   PetscScalarKokkosView            yv;
237 
238   PetscFunctionBegin;
239   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
240   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
241   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
242   if (A->form_explicit_transpose) {
243     ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr);
244     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah;
245     mode = "N";
246   } else {
247     B    = A;
248     mode = "C";
249   }
250   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
251   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
252   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
253   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
254   ierr = WaitForKokkos();CHKERRQ(ierr);
255   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
256   PetscFunctionReturn(0);
257 }
258 
259 /* z = A x + y */
260 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
261 {
262   PetscErrorCode                   ierr;
263   Mat_SeqAIJKokkos                 *aijkok;
264   ConstPetscScalarKokkosView       xv,yv;
265   PetscScalarKokkosView            zv;
266 
267   PetscFunctionBegin;
268   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
269   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
270   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
271   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
272   if (zz != yy) Kokkos::deep_copy(zv,yv);
273   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
274   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
275   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
276   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
277   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
278   ierr = WaitForKokkos();CHKERRQ(ierr);
279   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
280   PetscFunctionReturn(0);
281 }
282 
283 /* z = A^T x + y */
284 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
285 {
286   PetscErrorCode                   ierr;
287   Mat_SeqAIJKokkos                 *aijkok;
288   Mat                              B;
289   const char                       *mode;
290   ConstPetscScalarKokkosView       xv,yv;
291   PetscScalarKokkosView            zv;
292 
293   PetscFunctionBegin;
294   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
295   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
296   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
297   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
298   if (zz != yy) Kokkos::deep_copy(zv,yv);
299   if (A->form_explicit_transpose) {
300     ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr);
301     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At;
302     mode = "N";
303   } else {
304     B    = A;
305     mode = "T";
306   }
307   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
308   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
309   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
310   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
311   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
312   ierr = WaitForKokkos();CHKERRQ(ierr);
313   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
314   PetscFunctionReturn(0);
315 }
316 
317 /* z = A^H x + y */
318 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
319 {
320   PetscErrorCode                   ierr;
321   Mat_SeqAIJKokkos                 *aijkok;
322   Mat                              B;
323   const char                       *mode;
324   ConstPetscScalarKokkosView       xv,yv;
325   PetscScalarKokkosView            zv;
326 
327   PetscFunctionBegin;
328   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
329   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
330   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
331   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
332   if (zz != yy) Kokkos::deep_copy(zv,yv);
333   if (A->form_explicit_transpose) {
334     ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr);
335     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah;
336     mode = "N";
337   } else {
338     B    = A;
339     mode = "C";
340   }
341   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
342   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
343   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
344   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
345   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
346   ierr = WaitForKokkos();CHKERRQ(ierr);
347   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
348   PetscFunctionReturn(0);
349 }
350 
351 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
352 {
353   PetscErrorCode            ierr;
354   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
355 
356   PetscFunctionBegin;
357   switch (op) {
358     case MAT_FORM_EXPLICIT_TRANSPOSE:
359       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
360       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
361       A->form_explicit_transpose = flg;
362       break;
363     default:
364       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
365       break;
366   }
367   PetscFunctionReturn(0);
368 }
369 
370 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
371 {
372   PetscErrorCode ierr;
373   Mat            B;
374   Mat_SeqAIJ     *aij;
375 
376   PetscFunctionBegin;
377   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
378   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
379     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
380   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
381     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
382   }
383 
384   B    = *newmat;
385   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
386   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
387 
388   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
389   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
390   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr);
391   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
392   aij  = (Mat_SeqAIJ*)B->data;
393   aij->inode.use = PETSC_FALSE;
394 
395   PetscFunctionReturn(0);
396 }
397 
398 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
399 {
400   PetscErrorCode ierr;
401 
402   PetscFunctionBegin;
403   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
404   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
405   PetscFunctionReturn(0);
406 }
407 
408 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
409 {
410   PetscErrorCode             ierr;
411   Mat_SeqAIJKokkos           *aijkok;
412 
413   PetscFunctionBegin;
414   if (A->factortype == MAT_FACTOR_NONE) {
415     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
416     if (aijkok) {
417       if (aijkok->device_mat_d.data()) {
418         delete aijkok->colmap_d;
419         delete aijkok->i_uncompressed_d;
420       }
421       if (aijkok->diag_d) delete aijkok->diag_d;
422     }
423     delete aijkok;
424   } else {
425     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
426   }
427   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr);
428   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
429   A->spptr = NULL;
430   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
431   PetscFunctionReturn(0);
432 }
433 
434 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
435 {
436   PetscErrorCode ierr;
437 
438   PetscFunctionBegin;
439   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
440   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
441   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
442   PetscFunctionReturn(0);
443 }
444 
445 #if 0
446 static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data)
447 {
448   MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data);
449 
450   PetscFunctionBegin;
451   delete kh;
452   PetscFunctionReturn(0);
453 }
454 
455 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
456 {
457   Mat_Product          *product = C->product;
458   Mat                  A,B;
459   MatProductType       ptype;
460   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
461   bool                 tA,tB;
462   PetscErrorCode       ierr;
463   MatMatKernelHandle_t *kh;
464   Mat_SeqAIJ           *c;
465 
466   PetscFunctionBegin;
467   MatCheckProduct(C,1);
468   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
469   A = product->A;
470   B = product->B;
471   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
472   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
473   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
474   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
475   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
476   kh   = static_cast<MatMatKernelHandle_t*>(C->product->data);
477   ptype = product->type;
478   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
479   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
480   switch (ptype) {
481   case MATPRODUCT_AB:
482     tA = false;
483     tB = false;
484     break;
485   case MATPRODUCT_AtB:
486     tA = true;
487     tB = false;
488     break;
489   case MATPRODUCT_ABt:
490     tA = false;
491     tB = true;
492     break;
493   default:
494     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
495   }
496 
497   KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr);
498   C->offloadmask = PETSC_OFFLOAD_GPU;
499   /* shorter version of MatAssemblyEnd_SeqAIJ */
500   c = (Mat_SeqAIJ*)C->data;
501   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
502   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
503   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
504   c->reallocs         = 0;
505   C->info.mallocs    += 0;
506   C->info.nz_unneeded = 0;
507   C->assembled = C->was_assembled = PETSC_TRUE;
508   C->num_ass++;
509   /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */
510   // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray()
511   C->offloadmask = PETSC_OFFLOAD_BOTH;
512   // Also, we should add support to copy back from device to host
513   PetscFunctionReturn(0);
514 }
515 
516 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
517 {
518   Mat_Product          *product = C->product;
519   Mat                  A,B;
520   MatProductType       ptype;
521   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
522   PetscInt             m,n,k;
523   bool                 tA,tB;
524   PetscErrorCode       ierr;
525   Mat_SeqAIJ           *c;
526   MatMatKernelHandle_t *kh;
527 
528   PetscFunctionBegin;
529   MatCheckProduct(C,1);
530   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
531   A = product->A;
532   B = product->B;
533   // TODO only copy the i,j data, not the values
534   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
535   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
536   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
537   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
538   ptype = product->type;
539   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
540   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
541   switch (ptype) {
542   case MATPRODUCT_AB:
543     tA = false;
544     tB = false;
545     m = A->rmap->n;
546     n = B->cmap->n;
547     break;
548   case MATPRODUCT_AtB:
549     tA = true;
550     tB = false;
551     m = A->cmap->n;
552     n = B->cmap->n;
553     break;
554   case MATPRODUCT_ABt:
555     tA = false;
556     tB = true;
557     m = A->rmap->n;
558     n = B->rmap->n;
559     break;
560   default:
561     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
562   }
563   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
564   ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr);
565   c = (Mat_SeqAIJ*)C->data;
566 
567   kh = new MatMatKernelHandle_t;
568   // TODO SZ: ADD RUNTIME SELECTION OF THESE
569   kh->set_team_work_size(16);
570   kh->set_dynamic_scheduling(true);
571   // Select an spgemm algorithm, limited by configuration at compile-time and
572   // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED,
573   // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL}
574   std::string myalg("SPGEMM_KK_MEMORY");
575   kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg));
576 
577   /////////////////////////////////////
578   // TODO JZ
579   ckok = NULL; //new Mat_SeqAIJKokkos();
580   C->spptr = ckok;
581   KokkosCsrMatrix_t ccsr; // here only to have the code compile
582   KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr);
583   //cerr = WaitForKOKKOS();CHKERRCUDA(cerr);
584   //c->nz = get_nnz_from_ccsr
585   //////////////////////////////////////
586   c->singlemalloc = PETSC_FALSE;
587   c->free_a       = PETSC_TRUE;
588   c->free_ij      = PETSC_TRUE;
589   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
590   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
591   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
592   ////////////////////////////////////
593   // TODO JZ copy from device to c->i and c->j
594   ////////////////////////////////////
595   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
596   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
597   c->maxnz = c->nz;
598   c->nonzerorowcnt = 0;
599   c->rmax = 0;
600   for (k = 0; k < m; k++) {
601     const PetscInt nn = c->i[k+1] - c->i[k];
602     c->ilen[k] = c->imax[k] = nn;
603     c->nonzerorowcnt += (PetscInt)!!nn;
604     c->rmax = PetscMax(c->rmax,nn);
605   }
606 
607   C->nonzerostate++;
608   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
609   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
610   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
611   ckok->nonzerostate = C->nonzerostate;
612   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
613   C->preallocated  = PETSC_TRUE;
614   C->assembled     = PETSC_FALSE;
615   C->was_assembled = PETSC_FALSE;
616 
617   C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
618   C->product->data = kh;
619   C->product->destroy = MatMatKernelHandleDestroy_Private;
620   PetscFunctionReturn(0);
621 }
622 
623 /* handles sparse matrix matrix ops */
624 PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
625 {
626   Mat_Product    *product = mat->product;
627   PetscErrorCode ierr;
628   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
629 
630   PetscFunctionBegin;
631   MatCheckProduct(mat,1);
632   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
633   if (product->type == MATPRODUCT_ABC) {
634     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
635   }
636   if (Biskok && Ciskok) {
637     switch (product->type) {
638     case MATPRODUCT_AB:
639     case MATPRODUCT_AtB:
640     case MATPRODUCT_ABt:
641       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
642       break;
643     case MATPRODUCT_PtAP:
644     case MATPRODUCT_RARt:
645     case MATPRODUCT_ABC:
646       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
647       break;
648     default:
649       break;
650     }
651   } else { /* fallback for AIJ */
652     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
653   }
654   PetscFunctionReturn(0);
655 }
656 #endif
657 
658 static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
659 {
660   PetscErrorCode    ierr;
661   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
662   PetscFunctionBegin;
663   if (aijkok && aijkok->device_mat_d.data()) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported");
664   ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr);
665   PetscFunctionReturn(0);
666 }
667 
668 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
669 {
670   PetscErrorCode   ierr;
671   Mat_SeqAIJKokkos *aijkok;
672 
673   PetscFunctionBegin;
674   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
675   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
676   KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d);
677   A->offloadmask = PETSC_OFFLOAD_GPU;
678   ierr = WaitForKokkos();CHKERRQ(ierr);
679   ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr);
680   // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
681   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
682   PetscFunctionReturn(0);
683 }
684 
685 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
686 {
687   PetscErrorCode   ierr;
688   PetscBool        both = PETSC_FALSE;
689   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
690   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
691 
692   PetscFunctionBegin;
693   if (aijkok && aijkok->a_d.data()) {
694     KokkosBlas::fill(aijkok->a_d,0.0);
695     both = PETSC_TRUE;
696   }
697   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
698   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
699   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
700   else A->offloadmask = PETSC_OFFLOAD_CPU;
701   PetscFunctionReturn(0);
702 }
703 
704 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str)
705 {
706   PetscErrorCode ierr;
707   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
708 
709   PetscFunctionBegin;
710   if (X->ops->axpy != Y->ops->axpy) {
711     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
712     PetscFunctionReturn(0);
713   }
714   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) {
715     PetscBool e;
716     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
717     if (e) {
718       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
719       if (e) str = SAME_NONZERO_PATTERN;
720     }
721   }
722   if (str != SAME_NONZERO_PATTERN) {
723     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
724     PetscFunctionReturn(0);
725   } else {
726     if (Y->offloadmask == PETSC_OFFLOAD_CPU && X->offloadmask == PETSC_OFFLOAD_CPU) {
727       ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
728       PetscFunctionReturn(0);
729     } else {
730       ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
731       ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
732     }
733     Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
734     Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
735     if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) {
736       KokkosBlas::axpy(a,aijkokX->a_d,aijkokY->a_d);
737       Y->offloadmask = PETSC_OFFLOAD_GPU;
738       ierr = WaitForKokkos();CHKERRQ(ierr);
739       ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr);
740       // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
741       ierr = MatSeqAIJKokkosSyncHost(Y);CHKERRQ(ierr);
742     } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???");
743   }
744   PetscFunctionReturn(0);
745 }
746 
747 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
748 {
749   PetscErrorCode ierr;
750 
751   PetscFunctionBegin;
752   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
753   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
754   B->offloadmask = PETSC_OFFLOAD_CPU;
755   PetscFunctionReturn(0);
756 }
757 
758 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
759 {
760   PetscFunctionBegin;
761   A->boundtocpu = PETSC_FALSE;
762 
763   A->ops->setvalues                 = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */
764   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
765   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
766   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
767   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
768   A->ops->scale                     = MatScale_SeqAIJKokkos;
769   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
770   //A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
771   A->ops->mult                      = MatMult_SeqAIJKokkos;
772   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
773   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
774   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
775   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
776   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
777   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
778   PetscFunctionReturn(0);
779 }
780 
781 /* --------------------------------------------------------------------------------*/
782 /*@C
783    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
784    (the default parallel PETSc format). This matrix will ultimately be handled by
785    Kokkos for calculations. For good matrix
786    assembly performance the user should preallocate the matrix storage by setting
787    the parameter nz (or the array nnz).  By setting these parameters accurately,
788    performance during matrix assembly can be increased by more than a factor of 50.
789 
790    Collective
791 
792    Input Parameters:
793 +  comm - MPI communicator, set to PETSC_COMM_SELF
794 .  m - number of rows
795 .  n - number of columns
796 .  nz - number of nonzeros per row (same for all rows)
797 -  nnz - array containing the number of nonzeros in the various rows
798          (possibly different for each row) or NULL
799 
800    Output Parameter:
801 .  A - the matrix
802 
803    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
804    MatXXXXSetPreallocation() paradgm instead of this routine directly.
805    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
806 
807    Notes:
808    If nnz is given then nz is ignored
809 
810    The AIJ format (also called the Yale sparse matrix format or
811    compressed row storage), is fully compatible with standard Fortran 77
812    storage.  That is, the stored row and column indices can begin at
813    either one (as in Fortran) or zero.  See the users' manual for details.
814 
815    Specify the preallocated storage with either nz or nnz (not both).
816    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
817    allocation.  For large problems you MUST preallocate memory or you
818    will get TERRIBLE performance, see the users' manual chapter on matrices.
819 
820    By default, this format uses inodes (identical nodes) when possible, to
821    improve numerical efficiency of matrix-vector products and solves. We
822    search for consecutive rows with the same nonzero structure, thereby
823    reusing matrix information to achieve increased efficiency.
824 
825    Level: intermediate
826 
827 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
828 @*/
829 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
830 {
831   PetscErrorCode ierr;
832 
833   PetscFunctionBegin;
834   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
835   ierr = MatCreate(comm,A);CHKERRQ(ierr);
836   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
837   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
838   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
839   PetscFunctionReturn(0);
840 }
841 
842 // factorizations
843 typedef Kokkos::TeamPolicy<>::member_type team_member;
844 //
845 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
846 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
847 //
848 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
849 {
850   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
851   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
852   IS                 isrow = b->row,isicol = b->icol;
853   PetscErrorCode     ierr;
854   const PetscInt     *r_h,*ic_h;
855   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_d.data(), *aj_d=aijkok->j_d.data(), *bi_d=baijkok->i_d.data(), *bj_d=baijkok->j_d.data(), *bdiag_d = baijkok->diag_d->data();
856   const PetscScalar  *aa_d = aijkok->a_d.data();
857   PetscScalar        *ba_d = baijkok->a_d.data();
858   PetscBool          row_identity,col_identity;
859   PetscInt           nc, Nf, nVec=32; // should be a parameter
860   PetscContainer     container;
861 
862   PetscFunctionBegin;
863   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n);
864   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
865   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
866   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
867   if (container) {
868     PetscInt *pNf=NULL, nv;
869     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
870     Nf = (*pNf)%1000;
871     nv = (*pNf)/1000;
872     if (nv>0) nVec = nv;
873   } else Nf = 1;
874   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
875   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
876   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
877   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
878 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
879   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
880 #endif
881   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
882   {
883 #define KOKKOS_SHARED_LEVEL 1
884     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
885     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
886     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
887     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
888     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
889     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
890     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
891     size_t flops_h = 0.0;
892     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
893     Kokkos::View<size_t> d_flops_k ("flops");
894     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
895     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
896     Kokkos::deep_copy (d_flops_k, h_flops_k);
897     Kokkos::deep_copy (d_r_k, h_r_k);
898     Kokkos::deep_copy (d_ic_k, h_ic_k);
899     // Fill A --> fact
900     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
901         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in Cuda
902         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
903         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
904         // zero rows of B
905         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
906             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
907             PetscScalar *baL = ba_d + bi_d[rowb];
908             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
909             /* zero (unfactored row) */
910             for (int j=0;j<nzbL;j++) baL[j] = 0;
911             for (int j=0;j<nzbU;j++) baU[j] = 0;
912           });
913         // copy A into B
914         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
915             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
916             const PetscScalar *av    = aa_d + ai_d[rowa];
917             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
918             /* load in initial (unfactored row) */
919             for (int j=0;j<nza;j++) {
920               PetscInt    colb = ic[ajtmp[j]];
921               PetscScalar vala = av[j];
922               if (colb == rowb) {
923                 *(ba_d + bdiag_d[rowb]) = vala;
924               } else {
925                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
926                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
927                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
928                 for (int j=0; j<nz ; j++) {
929                   if (pbj[j] == colb) {
930                     pba[j] = vala;
931                     set++;
932                     break;
933                   }
934                 }
935                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
936               }
937             }
938           });
939       });
940     Kokkos::fence();
941 
942     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
943         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
944         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
945         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
946         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in Cuda
947         const PetscInt  start = field*nloc, end = start + nloc;
948         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
949         // A22 panel update for each row A(1,:) and col A(:,1)
950         for (int ii=start; ii<end-1; ii++) {
951           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
952           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
953           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
954           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
955           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
956               PetscInt kIdx = j*Ni + field_block_idx;
957               if (kIdx >= nzUi) /* void */ ;
958               else {
959                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
960                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
961                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
962                 size_t         st_idx;
963                 // find and do L(k,i) = A(:k,i) / A(i,i)
964                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
965                 // get column, there has got to be a better way
966                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
967                     //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii);
968                     if (pjL[j] == ii) {
969                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
970                       idx = j; // output
971                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
972                     }
973                   }, st_idx);
974                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
975                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii);
976                 else { // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
977                   // U(i+1,:end)
978                   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
979                       PetscScalar Uij = baUi[uiIdx];
980                       PetscInt    col = bjUi[uiIdx];
981                       if (col==myk) {
982                         // A_kk = A_kk - L_ki * U_ij(k)
983                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
984                         *Akkv = *Akkv - L_ki() * Uij; // UiK
985                       } else {
986                         PetscScalar    *start, *end, *pAkjv=NULL;
987                         PetscInt       high, low;
988                         const PetscInt *startj;
989                         if (col<myk) { // L
990                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
991                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
992                           start = pLki+1; // start at pLki+1, A22(myk,1)
993                           startj= bj_d + bi_d[myk] + idx;
994                           end   = ba_d + bi_d[myk+1];
995                         } else {
996                           PetscInt idx = bdiag_d[myk+1]+1;
997                           start = ba_d + idx;
998                           startj= bj_d + idx;
999                           end   = ba_d + bdiag_d[myk];
1000                         }
1001                         // search for 'col', use bisection search - TODO
1002                         low  = 0;
1003                         high = (PetscInt)(end-start);
1004                         while (high-low > 5) {
1005                           int t = (low+high)/2;
1006                           if (startj[t] > col) high = t;
1007                           else                 low  = t;
1008                         }
1009                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1010                           if (startj[pAkjv-start] == col) break;
1011                         }
1012                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col);
1013                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1014                       }
1015                     });
1016                 }
1017               }
1018             });
1019           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1020           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1021         } /* endof for (i=0; i<n; i++) { */
1022         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1023       });
1024     Kokkos::fence();
1025     Kokkos::deep_copy (h_flops_k, d_flops_k);
1026 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1027     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1028 #elif  defined(PETSC_USE_LOG)
1029     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1030 #endif
1031     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1032         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1033         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1034         /* Invert diagonal for simpler triangular solves */
1035         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1036             int i = start + outer_index*Ni + lg_rank%Ni;
1037             if (i < end) {
1038               PetscScalar *pv = ba_d + bdiag_d[i];
1039               *pv = 1.0/(*pv);
1040             }
1041           });
1042       });
1043   }
1044 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1045   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1046 #endif
1047   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
1048   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
1049 
1050   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1051   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
1052   if (b->inode.size) {
1053     B->ops->solve = MatSolve_SeqAIJ_Inode;
1054   } else if (row_identity && col_identity) {
1055     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1056   } else {
1057     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1058   }
1059   B->offloadmask = PETSC_OFFLOAD_GPU;
1060   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
1061   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1062   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1063   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1064   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1065   B->assembled              = PETSC_TRUE;
1066   B->preallocated           = PETSC_TRUE;
1067 
1068   PetscFunctionReturn(0);
1069 }
1070 
1071 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1072 {
1073   PetscErrorCode   ierr;
1074 
1075   PetscFunctionBegin;
1076   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1077   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1078   PetscFunctionReturn(0);
1079 }
1080 
1081 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1082 {
1083   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1084 
1085   PetscFunctionBegin;
1086   if (!factors->sptrsv_symbolic_completed) {
1087     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1088     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1089     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1090   }
1091   PetscFunctionReturn(0);
1092 }
1093 
1094 /* Check if we need to update factors etc for transpose solve */
1095 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1096 {
1097   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1098   MatColumnIndexType             n = A->rmap->n;
1099 
1100   PetscFunctionBegin;
1101   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1102     /* Update L^T and do sptrsv symbolic */
1103     factors->iLt_d = MatRowOffsetKokkosView("factors->iLt_d",n+1);
1104     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1105     factors->jLt_d = MatColumnIndexKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1106     factors->aLt_d = MatValueKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1107 
1108     KokkosKernels::Impl::transpose_matrix<
1109       ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView,
1110       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView,
1111       MatRowOffsetKokkosView,DefaultExecutionSpace>(
1112         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1113         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1114 
1115     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1116       We have to sort the indices, until KK provides finer control options.
1117     */
1118     KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace,
1119       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>(
1120         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1121 
1122     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1123 
1124     /* Update U^T and do sptrsv symbolic */
1125     factors->iUt_d = MatRowOffsetKokkosView("factors->iUt_d",n+1);
1126     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1127     factors->jUt_d = MatColumnIndexKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1128     factors->aUt_d = MatValueKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1129 
1130     KokkosKernels::Impl::transpose_matrix<
1131       ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView,
1132       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView,
1133       MatRowOffsetKokkosView,DefaultExecutionSpace>(
1134         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1135         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1136 
1137     /* Sort indices. See comments above */
1138     KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace,
1139       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>(
1140         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1141 
1142     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1143     factors->transpose_updated = PETSC_TRUE;
1144   }
1145   PetscFunctionReturn(0);
1146 }
1147 
1148 /* Solve Ax = b, with A = LU */
1149 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1150 {
1151   PetscErrorCode                 ierr;
1152   ConstPetscScalarKokkosView     bv;
1153   PetscScalarKokkosView          xv;
1154   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1155 
1156   PetscFunctionBegin;
1157   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
1158   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1159   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1160   /* Solve L tmpv = b */
1161   KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector);
1162   /* Solve Ux = tmpv */
1163   KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv);
1164   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1165   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1166   PetscFunctionReturn(0);
1167 }
1168 
1169 /* Solve A^T x = b, with A^T = U^T L^T */
1170 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1171 {
1172   PetscErrorCode                 ierr;
1173   ConstPetscScalarKokkosView     bv;
1174   PetscScalarKokkosView          xv;
1175   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1176 
1177   PetscFunctionBegin;
1178   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
1179   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1180   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1181   /* Solve U^T tmpv = b */
1182   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1183 
1184   /* Solve L^T x = tmpv */
1185   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1186   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1187   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1188   PetscFunctionReturn(0);
1189 }
1190 
1191 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1192 {
1193   PetscErrorCode                 ierr;
1194   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1195   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1196   PetscInt                       fill_lev = info->levels;
1197 
1198   PetscFunctionBegin;
1199   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1200   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,aijkok->a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1201 
1202   B->assembled                       = PETSC_TRUE;
1203   B->preallocated                    = PETSC_TRUE;
1204   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1205   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1206   B->ops->matsolve                   = NULL;
1207   B->ops->matsolvetranspose          = NULL;
1208   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1209 
1210   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1211   factors->transpose_updated         = PETSC_FALSE;
1212   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1213   PetscFunctionReturn(0);
1214 }
1215 
1216 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1217 {
1218   PetscErrorCode                 ierr;
1219   Mat_SeqAIJKokkos               *aijkok;
1220   Mat_SeqAIJ                     *b;
1221   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1222   PetscInt                       fill_lev = info->levels;
1223   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1224   PetscInt                       n = A->rmap->n;
1225 
1226   PetscFunctionBegin;
1227   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1228   /* Rebuild factors */
1229   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1230   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1231 
1232   /* Create a spiluk handle and then do symbolic factorization */
1233   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1234   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1235 
1236   auto spiluk_handle = factors->kh.get_spiluk_handle();
1237 
1238   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1239   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1240   Kokkos::realloc(factors->iU_d,n+1);
1241   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1242 
1243   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1244   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1245   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1246 
1247   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1248   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1249   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1250   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1251 
1252   /* TODO: add options to select sptrsv algorithms */
1253   /* Create sptrsv handles for L, U and their transpose */
1254  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1255   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1256  #else
1257   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1258  #endif
1259 
1260   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1261   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1262   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1263   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1264 
1265   /* Fill fields of the factor matrix B */
1266   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1267   b     = (Mat_SeqAIJ*)B->data;
1268   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1269   B->info.fill_ratio_given  = info->fill;
1270   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1271 
1272   B->offloadmask            = PETSC_OFFLOAD_GPU;
1273   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1274   PetscFunctionReturn(0);
1275 }
1276 
1277 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1278 {
1279   PetscErrorCode   ierr;
1280   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1281   const PetscInt   nrows   = A->rmap->n;
1282 
1283   PetscFunctionBegin;
1284   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1285   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1286   // move B data into Kokkos
1287   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
1288   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
1289   {
1290     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1291     if (!baijkok->diag_d) {
1292       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1293       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1294       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
1295     }
1296   }
1297   PetscFunctionReturn(0);
1298 }
1299 
1300 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1301 {
1302   PetscFunctionBegin;
1303   *type = MATSOLVERKOKKOS;
1304   PetscFunctionReturn(0);
1305 }
1306 
1307 // use -pc_factor_mat_solver_type kokkosdevice
1308 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1309 {
1310   PetscFunctionBegin;
1311   *type = MATSOLVERKOKKOSDEVICE;
1312   PetscFunctionReturn(0);
1313 }
1314 
1315 /*MC
1316   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1317   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1318 
1319   Level: beginner
1320 
1321 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1322 M*/
1323 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1324 {
1325   PetscErrorCode ierr;
1326   PetscInt       n = A->rmap->n;
1327 
1328   PetscFunctionBegin;
1329   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1330   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1331   (*B)->factortype = ftype;
1332   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1333   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1334 
1335   if (ftype == MAT_FACTOR_LU) {
1336     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1337     (*B)->canuseordering         = PETSC_TRUE;
1338     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1339   } else if (ftype == MAT_FACTOR_ILU) {
1340     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1341     (*B)->canuseordering         = PETSC_FALSE;
1342     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1343   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1344 
1345   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1346   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1347   PetscFunctionReturn(0);
1348 }
1349 
1350 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1351 {
1352   PetscErrorCode ierr;
1353   PetscInt       n = A->rmap->n;
1354 
1355   PetscFunctionBegin;
1356   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1357   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1358   (*B)->factortype = ftype;
1359   (*B)->canuseordering = PETSC_TRUE;
1360   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1361   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1362 
1363   if (ftype == MAT_FACTOR_LU) {
1364     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1365     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1366   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1367 
1368   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1369   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1370   PetscFunctionReturn(0);
1371 }
1372 
1373 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1374 {
1375   PetscErrorCode ierr;
1376 
1377   PetscFunctionBegin;
1378   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1379   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1380   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1381   PetscFunctionReturn(0);
1382 }
1383 
1384