xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision f5ff9c666c0d37e8a7ec3aa2f8e2aa9e44449bdb)
1 #include <petsc/private/petscimpl.h>
2 #include <petscsystypes.h>
3 #include <petscerror.h>
4 #include <petscvec_kokkos.hpp>
5 
6 #include <Kokkos_Core.hpp>
7 #include <KokkosBlas.hpp>
8 #include <KokkosSparse_CrsMatrix.hpp>
9 #include <KokkosSparse_spmv.hpp>
10 #include <../src/mat/impls/aij/seq/aij.h>
11 
12 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
13 #include <petscmat.h>
14 
15 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
16 
17 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
18 {
19   PetscErrorCode    ierr;
20   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
21 
22   PetscFunctionBegin;
23   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
24   if (aijkok && aijkok->device_mat_d.data()) {
25     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
26   }
27   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
28   PetscFunctionReturn(0);
29 }
30 
31 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
32 {
33   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ*>(A->data);
34   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
35   PetscInt                  nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
36 
37   PetscFunctionBegin;
38   /* If aijkok is not built yet OR the nonzero pattern on CPU has changed, build aijkok from the scratch */
39   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
40     delete aijkok;
41     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
42     aijkok->nonzerostate = A->nonzerostate;
43     A->spptr             = aijkok;
44   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
45     // Kokkos::deep_copy(aijkok->a_d,aijkok->a_h);
46     aijkok->a_dual.modify_host(); /* Mark host is modified */
47     aijkok->a_dual.sync_device(); /* Sync the device */
48     aijkok->need_sync_At_values = PETSC_TRUE; /* Mark matT and matH need to sync, but don't sync them now */
49     aijkok->need_sync_Ah_values = PETSC_TRUE;
50   }
51   A->offloadmask = PETSC_OFFLOAD_BOTH;
52   PetscFunctionReturn(0);
53 }
54 
55 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
56 {
57   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
58 
59   PetscFunctionBegin;
60   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
61   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
62     if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
63     Kokkos::deep_copy(aijkok->a_h,aijkok->a_d);
64     A->offloadmask = PETSC_OFFLOAD_BOTH;
65   }
66   PetscFunctionReturn(0);
67 }
68 
69 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
70 {
71   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
72   PetscErrorCode ierr;
73 
74   PetscFunctionBegin;
75   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
76   *array = a->a;
77   A->offloadmask = PETSC_OFFLOAD_CPU;
78   PetscFunctionReturn(0);
79 }
80 
81 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
82 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat)
83 {
84   Mat_SeqAIJKokkos *aijkok;
85   Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat);
86 
87   PetscFunctionBegin;
88   // ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
89   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
90   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos");
91   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
92   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
93   PetscFunctionReturn(0);
94 }
95 
96 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
97 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat)
98 {
99   Mat_SeqAIJKokkos *aijkok;
100 
101   PetscFunctionBegin;
102   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
103   if (aijkok && aijkok->device_mat_d.data()) {
104     *d_mat = aijkok->device_mat_d.data();
105   } else {
106     PetscErrorCode   ierr;
107     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
108     *d_mat  = NULL;
109   }
110   PetscFunctionReturn(0);
111 }
112 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose(Mat A)
113 {
114   PetscErrorCode                   ierr;
115   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
116 
117   PetscFunctionBegin;
118   if (!aijkok->At) { /* Generate At for the first time */
119     ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->At);CHKERRQ(ierr);
120     ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr);
121   } else if (aijkok->need_sync_At_values) { /* Only update At values */
122     ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->At);CHKERRQ(ierr);
123     ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr);
124   }
125   aijkok->need_sync_At_values = PETSC_FALSE;
126   PetscFunctionReturn(0);
127 }
128 
129 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeHermitian(Mat A)
130 {
131   PetscErrorCode                   ierr;
132   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
133 
134   PetscFunctionBegin;
135   if (!aijkok->Ah) { /* Generate Ah for the first time */
136     ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->Ah);CHKERRQ(ierr);
137     ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr);
138     ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr);
139   } else if (aijkok->need_sync_Ah_values) { /* Only update Ah values */
140     ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->Ah);CHKERRQ(ierr);
141     ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr);
142     ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr);
143   }
144   aijkok->need_sync_Ah_values = PETSC_FALSE;
145   PetscFunctionReturn(0);
146 }
147 
148 /* y = A x */
149 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
150 {
151   PetscErrorCode                   ierr;
152   Mat_SeqAIJKokkos                 *aijkok;
153   ConstPetscScalarKokkosView       xv;
154   PetscScalarKokkosView            yv;
155 
156   PetscFunctionBegin;
157   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
158   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
159   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
160   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
161   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
162   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
163   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
164   ierr   = WaitForKokkos();CHKERRQ(ierr);
165   /* 2.0*aijkok->csrmat.nnz()-aijkok->csrmat.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */
166   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
167   PetscFunctionReturn(0);
168 }
169 
170 /* y = A^T x */
171 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
172 {
173   PetscErrorCode                   ierr;
174   Mat_SeqAIJKokkos                 *aijkok;
175   Mat                              B;
176   const char                       *mode;
177   ConstPetscScalarKokkosView       xv;
178   PetscScalarKokkosView            yv;
179 
180   PetscFunctionBegin;
181   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
182   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
183   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
184   if (A->form_explicit_transpose) {
185     ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr);
186     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At;
187     mode = "N";
188   } else {
189     B    = A;
190     mode = "T";
191   }
192   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
193   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
194   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
195   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
196   ierr = WaitForKokkos();CHKERRQ(ierr);
197   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
198   PetscFunctionReturn(0);
199 }
200 
201 /* y = A^H x */
202 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
203 {
204   PetscErrorCode                   ierr;
205   Mat_SeqAIJKokkos                 *aijkok;
206   Mat                              B;
207   const char                       *mode;
208   ConstPetscScalarKokkosView       xv;
209   PetscScalarKokkosView            yv;
210 
211   PetscFunctionBegin;
212   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
213   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
214   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
215   if (A->form_explicit_transpose) {
216     ierr = MatSeqAIJKokkosGenerateTransposeHermitian(A);CHKERRQ(ierr);
217     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah;
218     mode = "N";
219   } else {
220     B    = A;
221     mode = "C";
222   }
223   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
224   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
225   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
226   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
227   ierr = WaitForKokkos();CHKERRQ(ierr);
228   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 /* z = A x + y */
233 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
234 {
235   PetscErrorCode                   ierr;
236   Mat_SeqAIJKokkos                 *aijkok;
237   ConstPetscScalarKokkosView       xv,yv;
238   PetscScalarKokkosView            zv;
239 
240   PetscFunctionBegin;
241   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
242   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
243   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
244   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
245   if (zz != yy) Kokkos::deep_copy(zv,yv);
246   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
247   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
248   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
249   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
250   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
251   ierr = WaitForKokkos();CHKERRQ(ierr);
252   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
253   PetscFunctionReturn(0);
254 }
255 
256 /* z = A^T x + y */
257 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
258 {
259   PetscErrorCode                   ierr;
260   Mat_SeqAIJKokkos                 *aijkok;
261   Mat                              B;
262   const char                       *mode;
263   ConstPetscScalarKokkosView       xv,yv;
264   PetscScalarKokkosView            zv;
265 
266   PetscFunctionBegin;
267   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
268   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
269   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
270   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
271   if (zz != yy) Kokkos::deep_copy(zv,yv);
272   if (A->form_explicit_transpose) {
273     ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr);
274     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At;
275     mode = "N";
276   } else {
277     B    = A;
278     mode = "T";
279   }
280   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
281   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
282   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
283   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
284   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
285   ierr = WaitForKokkos();CHKERRQ(ierr);
286   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
287   PetscFunctionReturn(0);
288 }
289 
290 /* z = A^H x + y */
291 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
292 {
293   PetscErrorCode                   ierr;
294   Mat_SeqAIJKokkos                 *aijkok;
295   Mat                              B;
296   const char                       *mode;
297   ConstPetscScalarKokkosView       xv,yv;
298   PetscScalarKokkosView            zv;
299 
300   PetscFunctionBegin;
301   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
302   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
303   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
304   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
305   if (zz != yy) Kokkos::deep_copy(zv,yv);
306   if (A->form_explicit_transpose) {
307     ierr = MatSeqAIJKokkosGenerateTransposeHermitian(A);CHKERRQ(ierr);
308     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah;
309     mode = "N";
310   } else {
311     B    = A;
312     mode = "C";
313   }
314   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
315   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
316   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
317   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
318   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
319   ierr = WaitForKokkos();CHKERRQ(ierr);
320   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
321   PetscFunctionReturn(0);
322 }
323 
324 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
325 {
326   PetscErrorCode            ierr;
327   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
328 
329   PetscFunctionBegin;
330   switch (op) {
331     case MAT_FORM_EXPLICIT_TRANSPOSE:
332       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
333       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
334       A->form_explicit_transpose = flg;
335       break;
336     default:
337       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
338       break;
339   }
340   PetscFunctionReturn(0);
341 }
342 
343 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
344 {
345   PetscErrorCode ierr;
346   Mat            B;
347   Mat_SeqAIJ     *aij;
348 
349   PetscFunctionBegin;
350   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
351   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
352     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
353   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
354     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
355   }
356 
357   B    = *newmat;
358   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
359   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
360   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
361   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
362   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr);
363   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
364   aij  = (Mat_SeqAIJ*)B->data;
365   aij->inode.use = PETSC_FALSE;
366 
367   B->offloadmask = PETSC_OFFLOAD_CPU;
368   PetscFunctionReturn(0);
369 }
370 
371 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
372 {
373   PetscErrorCode ierr;
374 
375   PetscFunctionBegin;
376   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
377   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
378   PetscFunctionReturn(0);
379 }
380 
381 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
382 {
383   PetscErrorCode        ierr;
384   Mat_SeqAIJKokkos      *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
385 
386   PetscFunctionBegin;
387   if (aijkok) {
388     if (aijkok->device_mat_d.data()) {
389       delete aijkok->colmap_d;
390       delete aijkok->i_uncompressed_d;
391     }
392     if (aijkok->diag_d) delete aijkok->diag_d;
393   }
394   delete aijkok;
395   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr);
396   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
397   A->spptr = NULL;
398   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
399   PetscFunctionReturn(0);
400 }
401 
402 #if 0
403 static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data)
404 {
405   MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data);
406 
407   PetscFunctionBegin;
408   delete kh;
409   PetscFunctionReturn(0);
410 }
411 
412 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
413 {
414   Mat_Product          *product = C->product;
415   Mat                  A,B;
416   MatProductType       ptype;
417   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
418   bool                 tA,tB;
419   PetscErrorCode       ierr;
420   MatMatKernelHandle_t *kh;
421   Mat_SeqAIJ           *c;
422 
423   PetscFunctionBegin;
424   MatCheckProduct(C,1);
425   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
426   A = product->A;
427   B = product->B;
428   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
429   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
430   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
431   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
432   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
433   kh   = static_cast<MatMatKernelHandle_t*>(C->product->data);
434   ptype = product->type;
435   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
436   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
437   switch (ptype) {
438   case MATPRODUCT_AB:
439     tA = false;
440     tB = false;
441     break;
442   case MATPRODUCT_AtB:
443     tA = true;
444     tB = false;
445     break;
446   case MATPRODUCT_ABt:
447     tA = false;
448     tB = true;
449     break;
450   default:
451     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
452   }
453 
454   KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr);
455   C->offloadmask = PETSC_OFFLOAD_GPU;
456   /* shorter version of MatAssemblyEnd_SeqAIJ */
457   c = (Mat_SeqAIJ*)C->data;
458   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
459   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
460   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
461   c->reallocs         = 0;
462   C->info.mallocs    += 0;
463   C->info.nz_unneeded = 0;
464   C->assembled = C->was_assembled = PETSC_TRUE;
465   C->num_ass++;
466   /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */
467   // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray()
468   C->offloadmask = PETSC_OFFLOAD_BOTH;
469   // Also, we should add support to copy back from device to host
470   PetscFunctionReturn(0);
471 }
472 
473 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
474 {
475   Mat_Product          *product = C->product;
476   Mat                  A,B;
477   MatProductType       ptype;
478   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
479   PetscInt             m,n,k;
480   bool                 tA,tB;
481   PetscErrorCode       ierr;
482   Mat_SeqAIJ           *c;
483   MatMatKernelHandle_t *kh;
484 
485   PetscFunctionBegin;
486   MatCheckProduct(C,1);
487   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
488   A = product->A;
489   B = product->B;
490   // TODO only copy the i,j data, not the values
491   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
492   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
493   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
494   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
495   ptype = product->type;
496   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
497   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
498   switch (ptype) {
499   case MATPRODUCT_AB:
500     tA = false;
501     tB = false;
502     m = A->rmap->n;
503     n = B->cmap->n;
504     break;
505   case MATPRODUCT_AtB:
506     tA = true;
507     tB = false;
508     m = A->cmap->n;
509     n = B->cmap->n;
510     break;
511   case MATPRODUCT_ABt:
512     tA = false;
513     tB = true;
514     m = A->rmap->n;
515     n = B->rmap->n;
516     break;
517   default:
518     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
519   }
520   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
521   ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr);
522   c = (Mat_SeqAIJ*)C->data;
523 
524   kh = new MatMatKernelHandle_t;
525   // TODO SZ: ADD RUNTIME SELECTION OF THESE
526   kh->set_team_work_size(16);
527   kh->set_dynamic_scheduling(true);
528   // Select an spgemm algorithm, limited by configuration at compile-time and
529   // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED,
530   // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL}
531   std::string myalg("SPGEMM_KK_MEMORY");
532   kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg));
533 
534   /////////////////////////////////////
535   // TODO JZ
536   ckok = NULL; //new Mat_SeqAIJKokkos();
537   C->spptr = ckok;
538   KokkosCsrMatrix_t ccsr; // here only to have the code compile
539   KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr);
540   //cerr = WaitForKOKKOS();CHKERRCUDA(cerr);
541   //c->nz = get_nnz_from_ccsr
542   //////////////////////////////////////
543   c->singlemalloc = PETSC_FALSE;
544   c->free_a       = PETSC_TRUE;
545   c->free_ij      = PETSC_TRUE;
546   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
547   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
548   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
549   ////////////////////////////////////
550   // TODO JZ copy from device to c->i and c->j
551   ////////////////////////////////////
552   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
553   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
554   c->maxnz = c->nz;
555   c->nonzerorowcnt = 0;
556   c->rmax = 0;
557   for (k = 0; k < m; k++) {
558     const PetscInt nn = c->i[k+1] - c->i[k];
559     c->ilen[k] = c->imax[k] = nn;
560     c->nonzerorowcnt += (PetscInt)!!nn;
561     c->rmax = PetscMax(c->rmax,nn);
562   }
563 
564   C->nonzerostate++;
565   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
566   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
567   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
568   ckok->nonzerostate = C->nonzerostate;
569   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
570   C->preallocated  = PETSC_TRUE;
571   C->assembled     = PETSC_FALSE;
572   C->was_assembled = PETSC_FALSE;
573 
574   C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
575   C->product->data = kh;
576   C->product->destroy = MatMatKernelHandleDestroy_Private;
577   PetscFunctionReturn(0);
578 }
579 
580 /* handles sparse matrix matrix ops */
581 PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
582 {
583   Mat_Product    *product = mat->product;
584   PetscErrorCode ierr;
585   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
586 
587   PetscFunctionBegin;
588   MatCheckProduct(mat,1);
589   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
590   if (product->type == MATPRODUCT_ABC) {
591     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
592   }
593   if (Biskok && Ciskok) {
594     switch (product->type) {
595     case MATPRODUCT_AB:
596     case MATPRODUCT_AtB:
597     case MATPRODUCT_ABt:
598       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
599       break;
600     case MATPRODUCT_PtAP:
601     case MATPRODUCT_RARt:
602     case MATPRODUCT_ABC:
603       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
604       break;
605     default:
606       break;
607     }
608   } else { /* fallback for AIJ */
609     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
610   }
611   PetscFunctionReturn(0);
612 }
613 #endif
614 
615 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
616 {
617   PetscErrorCode ierr;
618 
619   PetscFunctionBegin;
620   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
621   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
622   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
623   PetscFunctionReturn(0);
624 }
625 
626 static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
627 {
628   PetscErrorCode    ierr;
629   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
630   PetscFunctionBegin;
631   if (aijkok && aijkok->device_mat_d.data()) {
632     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported");
633   }
634   ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr);
635   PetscFunctionReturn(0);
636 }
637 
638 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
639 {
640   PetscErrorCode   ierr;
641   Mat_SeqAIJKokkos *aijkok;
642 
643   PetscFunctionBegin;
644   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
645   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
646   KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d);
647   A->offloadmask = PETSC_OFFLOAD_GPU;
648   ierr = WaitForKokkos();CHKERRQ(ierr);
649   ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr);
650   // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
651   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
652   PetscFunctionReturn(0);
653 }
654 
655 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
656 {
657   PetscErrorCode   ierr;
658   PetscBool        both = PETSC_FALSE;
659   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
660   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
661 
662   PetscFunctionBegin;
663   if (aijkok && aijkok->a_d.data()) {
664     KokkosBlas::fill(aijkok->a_d,0.0);
665     both = PETSC_TRUE;
666   }
667   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
668   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
669   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
670   else A->offloadmask = PETSC_OFFLOAD_CPU;
671   PetscFunctionReturn(0);
672 }
673 
674 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str)
675 {
676   PetscErrorCode ierr;
677   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
678 
679   PetscFunctionBegin;
680   if (X->ops->axpy != Y->ops->axpy) {
681     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
682     PetscFunctionReturn(0);
683   }
684   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) {
685     PetscBool e;
686     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
687     if (e) {
688       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
689       if (e) str = SAME_NONZERO_PATTERN;
690     }
691   }
692   if (str != SAME_NONZERO_PATTERN) {
693     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
694     PetscFunctionReturn(0);
695   } else {
696     if (Y->offloadmask == PETSC_OFFLOAD_CPU && X->offloadmask == PETSC_OFFLOAD_CPU) {
697       ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
698       PetscFunctionReturn(0);
699     } else {
700       ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
701       ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
702     }
703     Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
704     Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
705     if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) {
706       KokkosBlas::axpy(a,aijkokX->a_d,aijkokY->a_d);
707       Y->offloadmask = PETSC_OFFLOAD_GPU;
708       ierr = WaitForKokkos();CHKERRQ(ierr);
709       ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr);
710       // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
711       ierr = MatSeqAIJKokkosSyncHost(Y);CHKERRQ(ierr);
712     } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???");
713   }
714   PetscFunctionReturn(0);
715 }
716 
717 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOS(Mat B,Mat A,const MatFactorInfo *info)
718 {
719   PetscErrorCode ierr;
720 
721   PetscFunctionBegin;
722   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
723   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
724   B->offloadmask = PETSC_OFFLOAD_CPU;
725   PetscFunctionReturn(0);
726 }
727 
728 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
729 {
730   PetscFunctionBegin;
731   A->boundtocpu = PETSC_FALSE;
732 
733   A->ops->setvalues                 = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */
734   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
735   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
736   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
737   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
738   A->ops->scale                     = MatScale_SeqAIJKokkos;
739   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
740   //A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
741   A->ops->mult                      = MatMult_SeqAIJKokkos;
742   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
743   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
744   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
745   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
746   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
747   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
748   PetscFunctionReturn(0);
749 }
750 
751 /* --------------------------------------------------------------------------------*/
752 /*@C
753    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
754    (the default parallel PETSc format). This matrix will ultimately be handled by
755    Kokkos for calculations. For good matrix
756    assembly performance the user should preallocate the matrix storage by setting
757    the parameter nz (or the array nnz).  By setting these parameters accurately,
758    performance during matrix assembly can be increased by more than a factor of 50.
759 
760    Collective
761 
762    Input Parameters:
763 +  comm - MPI communicator, set to PETSC_COMM_SELF
764 .  m - number of rows
765 .  n - number of columns
766 .  nz - number of nonzeros per row (same for all rows)
767 -  nnz - array containing the number of nonzeros in the various rows
768          (possibly different for each row) or NULL
769 
770    Output Parameter:
771 .  A - the matrix
772 
773    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
774    MatXXXXSetPreallocation() paradgm instead of this routine directly.
775    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
776 
777    Notes:
778    If nnz is given then nz is ignored
779 
780    The AIJ format (also called the Yale sparse matrix format or
781    compressed row storage), is fully compatible with standard Fortran 77
782    storage.  That is, the stored row and column indices can begin at
783    either one (as in Fortran) or zero.  See the users' manual for details.
784 
785    Specify the preallocated storage with either nz or nnz (not both).
786    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
787    allocation.  For large problems you MUST preallocate memory or you
788    will get TERRIBLE performance, see the users' manual chapter on matrices.
789 
790    By default, this format uses inodes (identical nodes) when possible, to
791    improve numerical efficiency of matrix-vector products and solves. We
792    search for consecutive rows with the same nonzero structure, thereby
793    reusing matrix information to achieve increased efficiency.
794 
795    Level: intermediate
796 
797 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
798 @*/
799 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
800 {
801   PetscErrorCode ierr;
802 
803   PetscFunctionBegin;
804   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
805   ierr = MatCreate(comm,A);CHKERRQ(ierr);
806   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
807   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
808   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
809   PetscFunctionReturn(0);
810 }
811 
812 // factorizations
813 typedef Kokkos::TeamPolicy<>::member_type team_member;
814 //
815 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
816 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
817 //
818 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
819 {
820   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
821   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
822   IS                 isrow = b->row,isicol = b->icol;
823   PetscErrorCode     ierr;
824   const PetscInt     *r_h,*ic_h;
825   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_d.data(), *aj_d=aijkok->j_d.data(), *bi_d=baijkok->i_d.data(), *bj_d=baijkok->j_d.data(), *bdiag_d = baijkok->diag_d->data();
826   const PetscScalar  *aa_d = aijkok->a_d.data();
827   PetscScalar        *ba_d = baijkok->a_d.data();
828   PetscBool          row_identity,col_identity;
829   PetscInt           nc, Nf, nVec=32; // should be a parameter
830   PetscContainer     container;
831 
832   PetscFunctionBegin;
833   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n);
834   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
835   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
836   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
837   if (container) {
838     PetscInt *pNf=NULL, nv;
839     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
840     Nf = (*pNf)%1000;
841     nv = (*pNf)/1000;
842     if (nv>0) nVec = nv;
843   } else Nf = 1;
844   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
845   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
846   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
847   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
848 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
849   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
850 #endif
851   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
852   {
853 #define KOKKOS_SHARED_LEVEL 1
854     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
855     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
856     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
857     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
858     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
859     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
860     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
861     size_t flops_h = 0.0;
862     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
863     Kokkos::View<size_t> d_flops_k ("flops");
864     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
865     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
866     Kokkos::deep_copy (d_flops_k, h_flops_k);
867     Kokkos::deep_copy (d_r_k, h_r_k);
868     Kokkos::deep_copy (d_ic_k, h_ic_k);
869     // Fill A --> fact
870     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
871         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in Cuda
872         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
873         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
874         // zero rows of B
875         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
876             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
877             PetscScalar *baL = ba_d + bi_d[rowb];
878             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
879             /* zero (unfactored row) */
880             for (int j=0;j<nzbL;j++) baL[j] = 0;
881             for (int j=0;j<nzbU;j++) baU[j] = 0;
882           });
883         // copy A into B
884         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
885             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
886             const PetscScalar *av    = aa_d + ai_d[rowa];
887             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
888             /* load in initial (unfactored row) */
889             for (int j=0;j<nza;j++) {
890               PetscInt    colb = ic[ajtmp[j]];
891               PetscScalar vala = av[j];
892               if (colb == rowb) {
893                 *(ba_d + bdiag_d[rowb]) = vala;
894               } else {
895                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
896                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
897                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
898                 for (int j=0; j<nz ; j++) {
899                   if (pbj[j] == colb) {
900                     pba[j] = vala;
901                     set++;
902                     break;
903                   }
904                 }
905                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
906               }
907             }
908           });
909       });
910     Kokkos::fence();
911 
912     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
913         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
914         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
915         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
916         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in Cuda
917         const PetscInt  start = field*nloc, end = start + nloc;
918         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
919         // A22 panel update for each row A(1,:) and col A(:,1)
920         for (int ii=start; ii<end-1; ii++) {
921           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
922           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
923           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
924           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
925           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
926               PetscInt kIdx = j*Ni + field_block_idx;
927               if (kIdx >= nzUi) /* void */ ;
928               else {
929                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
930                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
931                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
932                 size_t         st_idx;
933                 // find and do L(k,i) = A(:k,i) / A(i,i)
934                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
935                 // get column, there has got to be a better way
936                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
937                     //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii);
938                     if (pjL[j] == ii) {
939                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
940                       idx = j; // output
941                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
942                     }
943                   }, st_idx);
944                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
945                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii);
946                 else { // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
947                   // U(i+1,:end)
948                   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
949                       PetscScalar Uij = baUi[uiIdx];
950                       PetscInt    col = bjUi[uiIdx];
951                       if (col==myk) {
952                         // A_kk = A_kk - L_ki * U_ij(k)
953                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
954                         *Akkv = *Akkv - L_ki() * Uij; // UiK
955                       } else {
956                         PetscScalar    *start, *end, *pAkjv=NULL;
957                         PetscInt       high, low;
958                         const PetscInt *startj;
959                         if (col<myk) { // L
960                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
961                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
962                           start = pLki+1; // start at pLki+1, A22(myk,1)
963                           startj= bj_d + bi_d[myk] + idx;
964                           end   = ba_d + bi_d[myk+1];
965                         } else {
966                           PetscInt idx = bdiag_d[myk+1]+1;
967                           start = ba_d + idx;
968                           startj= bj_d + idx;
969                           end   = ba_d + bdiag_d[myk];
970                         }
971                         // search for 'col', use bisection search - TODO
972                         low  = 0;
973                         high = (PetscInt)(end-start);
974                         while (high-low > 5) {
975                           int t = (low+high)/2;
976                           if (startj[t] > col) high = t;
977                           else                 low  = t;
978                         }
979                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
980                           if (startj[pAkjv-start] == col) break;
981                         }
982                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col);
983                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
984                       }
985                     });
986                 }
987               }
988             });
989           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
990           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
991         } /* endof for (i=0; i<n; i++) { */
992         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
993       });
994     Kokkos::fence();
995     Kokkos::deep_copy (h_flops_k, d_flops_k);
996 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
997     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
998 #elif  defined(PETSC_USE_LOG)
999     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1000 #endif
1001     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1002         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1003         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1004         /* Invert diagonal for simpler triangular solves */
1005         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1006             int i = start + outer_index*Ni + lg_rank%Ni;
1007             if (i < end) {
1008               PetscScalar *pv = ba_d + bdiag_d[i];
1009               *pv = 1.0/(*pv);
1010             }
1011           });
1012       });
1013   }
1014 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1015   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1016 #endif
1017   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
1018   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
1019 
1020   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1021   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
1022   if (b->inode.size) {
1023     B->ops->solve = MatSolve_SeqAIJ_Inode;
1024   } else if (row_identity && col_identity) {
1025     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1026   } else {
1027     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1028   }
1029   B->offloadmask = PETSC_OFFLOAD_GPU;
1030   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
1031   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1032   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1033   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1034   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1035   B->assembled              = PETSC_TRUE;
1036   B->preallocated           = PETSC_TRUE;
1037 
1038   PetscFunctionReturn(0);
1039 }
1040 
1041 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOS(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1042 {
1043   PetscErrorCode   ierr;
1044 
1045   PetscFunctionBegin;
1046   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1047   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOS;
1048   PetscFunctionReturn(0);
1049 }
1050 
1051 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1052 {
1053   PetscErrorCode   ierr;
1054   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1055   const PetscInt   nrows   = A->rmap->n;
1056 
1057   PetscFunctionBegin;
1058   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1059   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1060   // move B data into Kokkos
1061   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
1062   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
1063   {
1064     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1065     if (!baijkok->diag_d) {
1066       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1067       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1068       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
1069     }
1070   }
1071   PetscFunctionReturn(0);
1072 }
1073 
1074 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat,MatFactorType,Mat*);
1075 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat,MatFactorType,Mat*);
1076 
1077 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1078 {
1079   PetscErrorCode ierr;
1080 
1081   PetscFunctionBegin;
1082   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos);CHKERRQ(ierr);
1083   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1084   PetscFunctionReturn(0);
1085 }
1086 
1087 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos(Mat A,MatSolverType *type)
1088 {
1089   PetscFunctionBegin;
1090   *type = MATSOLVERKOKKOS;
1091   PetscFunctionReturn(0);
1092 }
1093 
1094 // use -pc_factor_mat_solver_type kokkosdevice
1095 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1096 {
1097   PetscFunctionBegin;
1098   *type = MATSOLVERKOKKOSDEVICE;
1099   PetscFunctionReturn(0);
1100 }
1101 
1102 /*MC
1103   MATSOLVERKOKKOS = "kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1104   on a single GPU of type, seqaijkokkos, aijkokkos.
1105 
1106   Level: beginner
1107 
1108 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKOKKOS(), MATAIJKOKKOS, MatCreateAIJKOKKOS(), MatKOKKOSSetFormat(), MatKOKKOSStorageFormat, MatKOKKOSFormatOperation
1109 M*/
1110 
1111 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat A,MatFactorType ftype,Mat *B)
1112 {
1113   PetscErrorCode ierr;
1114   PetscInt       n = A->rmap->n;
1115 
1116   PetscFunctionBegin;
1117   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1118   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1119   (*B)->factortype = ftype;
1120   (*B)->useordering = PETSC_TRUE;
1121   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1122 
1123   if (ftype == MAT_FACTOR_LU) {
1124     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1125     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOS;
1126   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1127 
1128   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1129   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos);CHKERRQ(ierr);
1130   PetscFunctionReturn(0);
1131 }
1132 
1133 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1134 {
1135   PetscErrorCode ierr;
1136   PetscInt       n = A->rmap->n;
1137 
1138   PetscFunctionBegin;
1139   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1140   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1141   (*B)->factortype = ftype;
1142   (*B)->useordering = PETSC_TRUE;
1143   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1144 
1145   if (ftype == MAT_FACTOR_LU) {
1146     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1147     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1148   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1149 
1150   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1151   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1152   PetscFunctionReturn(0);
1153 }
1154