xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 7d1b7f08ecf63081ec25c2ae9bbc0564ea4e8a8e)
1 #include "petsc/private/petscimpl.h"
2 #include <petscsystypes.h>
3 #include <petscerror.h>
4 #include <petscveckokkos.hpp>
5 
6 #include <Kokkos_Core.hpp>
7 #include <KokkosBlas.hpp>
8 #include <KokkosSparse_CrsMatrix.hpp>
9 #include <KokkosSparse_spmv.hpp>
10 #include <KokkosSparse_spmv.hpp>
11 #include <../src/mat/impls/aij/seq/aij.h>
12 
13 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
14 #include <petscmat.h>
15 
16 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
17 
18 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
19 {
20   PetscErrorCode    ierr;
21   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
22 
23   PetscFunctionBegin;
24   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
25   if (aijkok && aijkok->device_mat_d.data()) {
26     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
27   }
28   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
29   PetscFunctionReturn(0);
30 }
31 
32 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
33 {
34   Mat_SeqAIJ       *aijseq = (Mat_SeqAIJ*)A->data;
35   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
36   PetscInt         nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
37 
38   PetscFunctionBegin;
39   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
40   /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */
41   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
42     delete aijkok;
43     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
44     aijkok->nonzerostate = A->nonzerostate;
45     A->spptr             = aijkok;
46   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
47     Kokkos::deep_copy(aijkok->a_d,aijkok->a_h);
48   }
49   A->offloadmask = PETSC_OFFLOAD_BOTH;
50   PetscFunctionReturn(0);
51 }
52 
53 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
54 {
55   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
56 
57   PetscFunctionBegin;
58   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
59   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
60     if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
61     Kokkos::deep_copy(aijkok->a_h,aijkok->a_d);
62     A->offloadmask = PETSC_OFFLOAD_BOTH;
63   }
64   PetscFunctionReturn(0);
65 }
66 
67 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
68 {
69   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
70   PetscErrorCode ierr;
71 
72   PetscFunctionBegin;
73   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
74   *array = a->a;
75   A->offloadmask = PETSC_OFFLOAD_CPU;
76   PetscFunctionReturn(0);
77 }
78 
79 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
80 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat)
81 {
82   Mat_SeqAIJKokkos *aijkok;
83   Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat);
84 
85   PetscFunctionBegin;
86   // ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
87   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
88   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos");
89   aijkok->device_mat_d = create_mirror(DeviceMemorySpace(),h_mat_k);
90   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
91   PetscFunctionReturn(0);
92 }
93 
94 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
95 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat)
96 {
97   Mat_SeqAIJKokkos *aijkok;
98 
99   PetscFunctionBegin;
100   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
101   if (aijkok && aijkok->device_mat_d.data()) {
102     *d_mat = aijkok->device_mat_d.data();
103   } else {
104     PetscErrorCode   ierr;
105     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
106     *d_mat  = NULL;
107   }
108   PetscFunctionReturn(0);
109 }
110 
111 /* y = A x */
112 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
113 {
114   PetscErrorCode                   ierr;
115   Mat_SeqAIJKokkos                 *aijkok;
116   ConstPetscScalarViewDevice_t     xv;
117   PetscScalarViewDevice_t          yv;
118 
119   PetscFunctionBegin;
120   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
121   ierr   = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
122   ierr   = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
123   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
124   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
125   ierr   = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
126   ierr   = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
127   /* 2.0*aijkok->csr.nnz()-aijkok->csr.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */
128   ierr   = WaitForKokkos();CHKERRQ(ierr);
129   ierr   = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
130   PetscFunctionReturn(0);
131 }
132 
133 /* y = A^T x */
134 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
135 {
136   PetscErrorCode                   ierr;
137   Mat_SeqAIJKokkos                 *aijkok;
138   ConstPetscScalarViewDevice_t     xv;
139   PetscScalarViewDevice_t          yv;
140 
141   PetscFunctionBegin;
142   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
143   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
144   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
145   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
146   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
147   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
148   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
149   ierr = WaitForKokkos();CHKERRQ(ierr);
150   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
151   PetscFunctionReturn(0);
152 }
153 
154 /* y = A^H x */
155 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
156 {
157   PetscErrorCode                   ierr;
158   Mat_SeqAIJKokkos                 *aijkok;
159   ConstPetscScalarViewDevice_t     xv;
160   PetscScalarViewDevice_t          yv;
161 
162   PetscFunctionBegin;
163   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
164   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
165   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
166   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
167   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
168   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
169   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
170   ierr = WaitForKokkos();CHKERRQ(ierr);
171   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
172   PetscFunctionReturn(0);
173 }
174 
175 /* z = A x + y */
176 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
177 {
178   PetscErrorCode                   ierr;
179   Mat_SeqAIJKokkos                 *aijkok;
180   ConstPetscScalarViewDevice_t     xv,yv;
181   PetscScalarViewDevice_t          zv;
182 
183   PetscFunctionBegin;
184   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
185   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
186   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
187   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
188   if (zz != yy) Kokkos::deep_copy(zv,yv);
189   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
190   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
191   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
192   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
193   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
194   ierr = WaitForKokkos();CHKERRQ(ierr);
195   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
196   PetscFunctionReturn(0);
197 }
198 
199 /* z = A^T x + y */
200 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
201 {
202   PetscErrorCode                   ierr;
203   Mat_SeqAIJKokkos                 *aijkok;
204   ConstPetscScalarViewDevice_t     xv,yv;
205   PetscScalarViewDevice_t          zv;
206 
207   PetscFunctionBegin;
208   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
209   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
210   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
211   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
212   if (zz != yy) Kokkos::deep_copy(zv,yv);
213   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
214   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
215   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
216   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
217   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
218   ierr = WaitForKokkos();CHKERRQ(ierr);
219   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
220   PetscFunctionReturn(0);
221 }
222 
223 /* z = A^H x + y */
224 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
225 {
226   PetscErrorCode                   ierr;
227   Mat_SeqAIJKokkos                 *aijkok;
228   ConstPetscScalarViewDevice_t     xv,yv;
229   PetscScalarViewDevice_t          zv;
230 
231   PetscFunctionBegin;
232   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
233   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
234   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
235   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
236   if (zz != yy) Kokkos::deep_copy(zv,yv);
237   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
238   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
239   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
240   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
241   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
242   ierr = WaitForKokkos();CHKERRQ(ierr);
243   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
244   PetscFunctionReturn(0);
245 }
246 
247 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
248 {
249   PetscErrorCode ierr;
250   Mat            B;
251   Mat_SeqAIJ     *aij;
252 
253   PetscFunctionBegin;
254   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
255   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
256     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
257   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
258     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
259   }
260 
261   B    = *newmat;
262   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
263   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
264   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
265   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
266   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr);
267   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
268   aij  = (Mat_SeqAIJ*)B->data;
269   aij->inode.use = PETSC_FALSE;
270 
271   B->offloadmask = PETSC_OFFLOAD_CPU;
272   PetscFunctionReturn(0);
273 }
274 
275 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
276 {
277   PetscErrorCode ierr;
278 
279   PetscFunctionBegin;
280   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
281   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
282   PetscFunctionReturn(0);
283 }
284 
285 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
286 {
287   PetscErrorCode        ierr;
288   Mat_SeqAIJKokkos      *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
289 
290   PetscFunctionBegin;
291   if (aijkok) {
292     if (aijkok->device_mat_d.data()) {
293       delete aijkok->colmap_d;
294       delete aijkok->i_uncompressed_d;
295     }
296     if (aijkok->diag_d) delete aijkok->diag_d;
297   }
298   delete aijkok;
299   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr);
300   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
301   PetscFunctionReturn(0);
302 }
303 
304 #if 0
305 static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data)
306 {
307   MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data);
308 
309   PetscFunctionBegin;
310   delete kh;
311   PetscFunctionReturn(0);
312 }
313 
314 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
315 {
316   Mat_Product          *product = C->product;
317   Mat                  A,B;
318   MatProductType       ptype;
319   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
320   bool                 tA,tB;
321   PetscErrorCode       ierr;
322   MatMatKernelHandle_t *kh;
323   Mat_SeqAIJ           *c;
324 
325   PetscFunctionBegin;
326   MatCheckProduct(C,1);
327   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
328   A = product->A;
329   B = product->B;
330   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
331   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
332   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
333   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
334   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
335   kh   = static_cast<MatMatKernelHandle_t*>(C->product->data);
336   ptype = product->type;
337   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
338   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
339   switch (ptype) {
340   case MATPRODUCT_AB:
341     tA = false;
342     tB = false;
343     break;
344   case MATPRODUCT_AtB:
345     tA = true;
346     tB = false;
347     break;
348   case MATPRODUCT_ABt:
349     tA = false;
350     tB = true;
351     break;
352   default:
353     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
354   }
355 
356   KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr);
357   C->offloadmask = PETSC_OFFLOAD_GPU;
358   /* shorter version of MatAssemblyEnd_SeqAIJ */
359   c = (Mat_SeqAIJ*)C->data;
360   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
361   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
362   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
363   c->reallocs         = 0;
364   C->info.mallocs    += 0;
365   C->info.nz_unneeded = 0;
366   C->assembled = C->was_assembled = PETSC_TRUE;
367   C->num_ass++;
368   /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */
369   // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray()
370   C->offloadmask = PETSC_OFFLOAD_BOTH;
371   // Also, we should add support to copy back from device to host
372   PetscFunctionReturn(0);
373 }
374 
375 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
376 {
377   Mat_Product          *product = C->product;
378   Mat                  A,B;
379   MatProductType       ptype;
380   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
381   PetscInt             m,n,k;
382   bool                 tA,tB;
383   PetscErrorCode       ierr;
384   Mat_SeqAIJ           *c;
385   MatMatKernelHandle_t *kh;
386 
387   PetscFunctionBegin;
388   MatCheckProduct(C,1);
389   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
390   A = product->A;
391   B = product->B;
392   // TODO only copy the i,j data, not the values
393   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
394   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
395   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
396   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
397   ptype = product->type;
398   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
399   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
400   switch (ptype) {
401   case MATPRODUCT_AB:
402     tA = false;
403     tB = false;
404     m = A->rmap->n;
405     n = B->cmap->n;
406     break;
407   case MATPRODUCT_AtB:
408     tA = true;
409     tB = false;
410     m = A->cmap->n;
411     n = B->cmap->n;
412     break;
413   case MATPRODUCT_ABt:
414     tA = false;
415     tB = true;
416     m = A->rmap->n;
417     n = B->rmap->n;
418     break;
419   default:
420     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
421   }
422   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
423   ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr);
424   c = (Mat_SeqAIJ*)C->data;
425 
426   kh = new MatMatKernelHandle_t;
427   // TODO SZ: ADD RUNTIME SELECTION OF THESE
428   kh->set_team_work_size(16);
429   kh->set_dynamic_scheduling(true);
430   // Select an spgemm algorithm, limited by configuration at compile-time and
431   // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED,
432   // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL}
433   std::string myalg("SPGEMM_KK_MEMORY");
434   kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg));
435 
436   /////////////////////////////////////
437   // TODO JZ
438   ckok = NULL; //new Mat_SeqAIJKokkos();
439   C->spptr = ckok;
440   KokkosCsrMatrix_t ccsr; // here only to have the code compile
441   KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr);
442   //cerr = WaitForKOKKOS();CHKERRCUDA(cerr);
443   //c->nz = get_nnz_from_ccsr
444   //////////////////////////////////////
445   c->singlemalloc = PETSC_FALSE;
446   c->free_a       = PETSC_TRUE;
447   c->free_ij      = PETSC_TRUE;
448   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
449   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
450   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
451   ////////////////////////////////////
452   // TODO JZ copy from device to c->i and c->j
453   ////////////////////////////////////
454   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
455   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
456   c->maxnz = c->nz;
457   c->nonzerorowcnt = 0;
458   c->rmax = 0;
459   for (k = 0; k < m; k++) {
460     const PetscInt nn = c->i[k+1] - c->i[k];
461     c->ilen[k] = c->imax[k] = nn;
462     c->nonzerorowcnt += (PetscInt)!!nn;
463     c->rmax = PetscMax(c->rmax,nn);
464   }
465 
466   C->nonzerostate++;
467   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
468   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
469   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
470   ckok->nonzerostate = C->nonzerostate;
471   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
472   C->preallocated  = PETSC_TRUE;
473   C->assembled     = PETSC_FALSE;
474   C->was_assembled = PETSC_FALSE;
475 
476   C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
477   C->product->data = kh;
478   C->product->destroy = MatMatKernelHandleDestroy_Private;
479   PetscFunctionReturn(0);
480 }
481 
482 /* handles sparse matrix matrix ops */
483 PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
484 {
485   Mat_Product    *product = mat->product;
486   PetscErrorCode ierr;
487   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
488 
489   PetscFunctionBegin;
490   MatCheckProduct(mat,1);
491   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
492   if (product->type == MATPRODUCT_ABC) {
493     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
494   }
495   if (Biskok && Ciskok) {
496     switch (product->type) {
497     case MATPRODUCT_AB:
498     case MATPRODUCT_AtB:
499     case MATPRODUCT_ABt:
500       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
501       break;
502     case MATPRODUCT_PtAP:
503     case MATPRODUCT_RARt:
504     case MATPRODUCT_ABC:
505       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
506       break;
507     default:
508       break;
509     }
510   } else { /* fallback for AIJ */
511     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
512   }
513   PetscFunctionReturn(0);
514 }
515 #endif
516 
517 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
518 {
519   PetscErrorCode ierr;
520 
521   PetscFunctionBegin;
522   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
523   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
524   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
525   PetscFunctionReturn(0);
526 }
527 
528 static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
529 {
530   PetscErrorCode    ierr;
531   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
532   PetscFunctionBegin;
533   if (aijkok && aijkok->device_mat_d.data()) {
534     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported");
535   }
536   ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr);
537   PetscFunctionReturn(0);
538 }
539 
540 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
541 {
542   PetscErrorCode   ierr;
543   Mat_SeqAIJKokkos *aijkok;
544 
545   PetscFunctionBegin;
546   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
547   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
548   KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d);
549   A->offloadmask = PETSC_OFFLOAD_GPU;
550   ierr = WaitForKokkos();CHKERRQ(ierr);
551   ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr);
552   // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
553   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
554   PetscFunctionReturn(0);
555 }
556 
557 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
558 {
559   PetscErrorCode   ierr;
560   PetscBool        both = PETSC_FALSE;
561   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
562   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
563 
564   PetscFunctionBegin;
565   if (aijkok && aijkok->a_d.data()) {
566     KokkosBlas::fill(aijkok->a_d,0.0);
567     both = PETSC_TRUE;
568   }
569   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
570   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
571   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
572   else A->offloadmask = PETSC_OFFLOAD_CPU;
573   PetscFunctionReturn(0);
574 }
575 
576 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str)
577 {
578   PetscErrorCode ierr;
579   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
580 
581   PetscFunctionBegin;
582   if (X->ops->axpy != Y->ops->axpy) {
583     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
584     PetscFunctionReturn(0);
585   }
586   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) {
587     PetscBool e;
588     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
589     if (e) {
590       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
591       if (e) str = SAME_NONZERO_PATTERN;
592     }
593   }
594   if (str != SAME_NONZERO_PATTERN) {
595     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
596     PetscFunctionReturn(0);
597   } else {
598     if (Y->offloadmask == PETSC_OFFLOAD_CPU && X->offloadmask == PETSC_OFFLOAD_CPU) {
599       ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
600       PetscFunctionReturn(0);
601     } else {
602       ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
603       ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
604     }
605     Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
606     Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
607     if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) {
608       KokkosBlas::axpy(a,aijkokX->a_d,aijkokY->a_d);
609       Y->offloadmask = PETSC_OFFLOAD_GPU;
610       ierr = WaitForKokkos();CHKERRQ(ierr);
611       ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr);
612       // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
613       ierr = MatSeqAIJKokkosSyncHost(Y);CHKERRQ(ierr);
614     } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???");
615   }
616   PetscFunctionReturn(0);
617 }
618 
619 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOS(Mat B,Mat A,const MatFactorInfo *info)
620 {
621   PetscErrorCode ierr;
622 
623   PetscFunctionBegin;
624   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
625   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
626   B->offloadmask = PETSC_OFFLOAD_CPU;
627   PetscFunctionReturn(0);
628 }
629 
630 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
631 {
632   PetscFunctionBegin;
633   A->boundtocpu = PETSC_FALSE;
634 
635   A->ops->setvalues                 = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */
636   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
637   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
638   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
639   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
640   A->ops->scale                     = MatScale_SeqAIJKokkos;
641   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
642   //A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
643   A->ops->mult                      = MatMult_SeqAIJKokkos;
644   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
645   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
646   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
647   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
648   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
649   PetscFunctionReturn(0);
650 }
651 
652 /* --------------------------------------------------------------------------------*/
653 /*C
654    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
655    (the default parallel PETSc format). This matrix will ultimately be handled by
656    Kokkos for calculations. For good matrix
657    assembly performance the user should preallocate the matrix storage by setting
658    the parameter nz (or the array nnz).  By setting these parameters accurately,
659    performance during matrix assembly can be increased by more than a factor of 50.
660 
661    Collective
662 
663    Input Parameters:
664 +  comm - MPI communicator, set to PETSC_COMM_SELF
665 .  m - number of rows
666 .  n - number of columns
667 .  nz - number of nonzeros per row (same for all rows)
668 -  nnz - array containing the number of nonzeros in the various rows
669          (possibly different for each row) or NULL
670 
671    Output Parameter:
672 .  A - the matrix
673 
674    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
675    MatXXXXSetPreallocation() paradgm instead of this routine directly.
676    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
677 
678    Notes:
679    If nnz is given then nz is ignored
680 
681    The AIJ format (also called the Yale sparse matrix format or
682    compressed row storage), is fully compatible with standard Fortran 77
683    storage.  That is, the stored row and column indices can begin at
684    either one (as in Fortran) or zero.  See the users' manual for details.
685 
686    Specify the preallocated storage with either nz or nnz (not both).
687    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
688    allocation.  For large problems you MUST preallocate memory or you
689    will get TERRIBLE performance, see the users' manual chapter on matrices.
690 
691    By default, this format uses inodes (identical nodes) when possible, to
692    improve numerical efficiency of matrix-vector products and solves. We
693    search for consecutive rows with the same nonzero structure, thereby
694    reusing matrix information to achieve increased efficiency.
695 
696    Level: intermediate
697 
698 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
699 @*/
700 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
701 {
702   PetscErrorCode ierr;
703 
704   PetscFunctionBegin;
705   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
706   ierr = MatCreate(comm,A);CHKERRQ(ierr);
707   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
708   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
709   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
710   PetscFunctionReturn(0);
711 }
712 
713 // factorizations
714 typedef Kokkos::TeamPolicy<>::member_type team_member;
715 //
716 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
717 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
718 //
719 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
720 {
721   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
722   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
723   IS                 isrow = b->row,isicol = b->icol;
724   PetscErrorCode     ierr;
725   const PetscInt     *r_h,*ic_h;
726   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_d.data(), *aj_d=aijkok->j_d.data(), *bi_d=baijkok->i_d.data(), *bj_d=baijkok->j_d.data(), *bdiag_d = baijkok->diag_d->data();
727   const PetscScalar  *aa_d = aijkok->a_d.data();
728   PetscScalar        *ba_d = baijkok->a_d.data();
729   PetscBool          row_identity,col_identity;
730   PetscInt           nc, Nf, nVec=32; // should be a parameter
731   PetscContainer     container;
732 
733   PetscFunctionBegin;
734   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n);
735   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
736   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
737   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
738   if (container) {
739     PetscInt *pNf=NULL, nv;
740     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
741     Nf = (*pNf)%1000;
742     nv = (*pNf)/1000;
743     if (nv>0) nVec = nv;
744   } else Nf = 1;
745   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
746   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
747   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
748   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
749 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
750   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
751 #endif
752   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
753   {
754 #define KOKKOS_SHARED_LEVEL 1
755     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
756     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
757     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
758     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
759     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
760     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
761     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
762     size_t flops_h = 0.0;
763     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
764     Kokkos::View<size_t> d_flops_k ("flops");
765     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
766     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
767     Kokkos::deep_copy (d_flops_k, h_flops_k);
768     Kokkos::deep_copy (d_r_k, h_r_k);
769     Kokkos::deep_copy (d_ic_k, h_ic_k);
770     // Fill A --> fact
771     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
772         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in Cuda
773         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
774         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
775         // zero rows of B
776         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
777             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
778             PetscScalar *baL = ba_d + bi_d[rowb];
779             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
780             /* zero (unfactored row) */
781             for (int j=0;j<nzbL;j++) baL[j] = 0;
782             for (int j=0;j<nzbU;j++) baU[j] = 0;
783           });
784         // copy A into B
785         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
786             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
787             const PetscScalar *av    = aa_d + ai_d[rowa];
788             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
789             /* load in initial (unfactored row) */
790             for (int j=0;j<nza;j++) {
791               PetscInt    colb = ic[ajtmp[j]];
792               PetscScalar vala = av[j];
793               if (colb == rowb) {
794                 *(ba_d + bdiag_d[rowb]) = vala;
795               } else {
796                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
797                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
798                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
799                 for (int j=0; j<nz ; j++) {
800                   if (pbj[j] == colb) {
801                     pba[j] = vala;
802                     set++;
803                     break;
804                   }
805                 }
806                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
807               }
808             }
809           });
810       });
811     Kokkos::fence();
812 
813     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
814         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
815         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
816         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
817         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in Cuda
818         const PetscInt  start = field*nloc, end = start + nloc;
819         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
820         // A22 panel update for each row A(1,:) and col A(:,1)
821         for (int ii=start; ii<end-1; ii++) {
822           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
823           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
824           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
825           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
826           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
827               PetscInt kIdx = j*Ni + field_block_idx;
828               if (kIdx >= nzUi) /* void */ ;
829               else {
830                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
831                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
832                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
833                 size_t         st_idx;
834                 // find and do L(k,i) = A(:k,i) / A(i,i)
835                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
836                 // get column, there has got to be a better way
837                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
838                     //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii);
839                     if (pjL[j] == ii) {
840                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
841                       idx = j; // output
842                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
843                     }
844                   }, st_idx);
845                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
846                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii);
847                 else { // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
848                   // U(i+1,:end)
849                   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
850                       PetscScalar Uij = baUi[uiIdx];
851                       PetscInt    col = bjUi[uiIdx];
852                       if (col==myk) {
853                         // A_kk = A_kk - L_ki * U_ij(k)
854                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
855                         *Akkv = *Akkv - L_ki() * Uij; // UiK
856                       } else {
857                         PetscScalar    *start, *end, *pAkjv=NULL;
858                         PetscInt       high, low;
859                         const PetscInt *startj;
860                         if (col<myk) { // L
861                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
862                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
863                           start = pLki+1; // start at pLki+1, A22(myk,1)
864                           startj= bj_d + bi_d[myk] + idx;
865                           end   = ba_d + bi_d[myk+1];
866                         } else {
867                           PetscInt idx = bdiag_d[myk+1]+1;
868                           start = ba_d + idx;
869                           startj= bj_d + idx;
870                           end   = ba_d + bdiag_d[myk];
871                         }
872                         // search for 'col', use bisection search - TODO
873                         low  = 0;
874                         high = (PetscInt)(end-start);
875                         while (high-low > 5) {
876                           int t = (low+high)/2;
877                           if (startj[t] > col) high = t;
878                           else                 low  = t;
879                         }
880                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
881                           if (startj[pAkjv-start] == col) break;
882                         }
883                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col);
884                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
885                       }
886                     });
887                 }
888               }
889             });
890           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
891           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
892         } /* endof for (i=0; i<n; i++) { */
893         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
894       });
895     Kokkos::fence();
896     Kokkos::deep_copy (h_flops_k, d_flops_k);
897 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
898     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
899 #elif  defined(PETSC_USE_LOG)
900     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
901 #endif
902     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
903         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
904         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
905         /* Invert diagonal for simpler triangular solves */
906         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
907             int i = start + outer_index*Ni + lg_rank%Ni;
908             if (i < end) {
909               PetscScalar *pv = ba_d + bdiag_d[i];
910               *pv = 1.0/(*pv);
911             }
912           });
913       });
914   }
915 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
916   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
917 #endif
918   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
919   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
920 
921   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
922   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
923   if (b->inode.size) {
924     B->ops->solve = MatSolve_SeqAIJ_Inode;
925   } else if (row_identity && col_identity) {
926     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
927   } else {
928     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
929   }
930   B->offloadmask = PETSC_OFFLOAD_GPU;
931   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
932   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
933   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
934   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
935   B->ops->matsolve          = MatMatSolve_SeqAIJ;
936   B->assembled              = PETSC_TRUE;
937   B->preallocated           = PETSC_TRUE;
938 
939   PetscFunctionReturn(0);
940 }
941 
942 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOS(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
943 {
944   PetscErrorCode   ierr;
945 
946   PetscFunctionBegin;
947   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
948   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOS;
949   PetscFunctionReturn(0);
950 }
951 
952 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
953 {
954   PetscErrorCode   ierr;
955   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
956   const PetscInt   nrows   = A->rmap->n;
957 
958   PetscFunctionBegin;
959   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
960   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
961   // move B data into Kokkos
962   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
963   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
964   {
965     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
966     if (!baijkok->diag_d) {
967       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
968       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_diag));
969       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
970     }
971   }
972   PetscFunctionReturn(0);
973 }
974 
975 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat,MatFactorType,Mat*);
976 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat,MatFactorType,Mat*);
977 
978 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
979 {
980   PetscErrorCode ierr;
981 
982   PetscFunctionBegin;
983   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos);CHKERRQ(ierr);
984   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
985   PetscFunctionReturn(0);
986 }
987 
988 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos(Mat A,MatSolverType *type)
989 {
990   PetscFunctionBegin;
991   *type = MATSOLVERKOKKOS;
992   PetscFunctionReturn(0);
993 }
994 
995 // use -pc_factor_mat_solver_type kokkosdevice
996 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
997 {
998   PetscFunctionBegin;
999   *type = MATSOLVERKOKKOSDEVICE;
1000   PetscFunctionReturn(0);
1001 }
1002 
1003 /*MC
1004   MATSOLVERKOKKOS = "kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1005   on a single GPU of type, seqaijkokkos, aijkokkos.
1006 
1007   Level: beginner
1008 
1009 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKOKKOS(), MATAIJKOKKOS, MatCreateAIJKOKKOS(), MatKOKKOSSetFormat(), MatKOKKOSStorageFormat, MatKOKKOSFormatOperation
1010 M*/
1011 
1012 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat A,MatFactorType ftype,Mat *B)
1013 {
1014   PetscErrorCode ierr;
1015   PetscInt       n = A->rmap->n;
1016 
1017   PetscFunctionBegin;
1018   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1019   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1020   (*B)->factortype = ftype;
1021   (*B)->useordering = PETSC_TRUE;
1022   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1023 
1024   if (ftype == MAT_FACTOR_LU) {
1025     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1026     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOS;
1027   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1028 
1029   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1030   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos);CHKERRQ(ierr);
1031   PetscFunctionReturn(0);
1032 }
1033 
1034 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1035 {
1036   PetscErrorCode ierr;
1037   PetscInt       n = A->rmap->n;
1038 
1039   PetscFunctionBegin;
1040   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1041   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1042   (*B)->factortype = ftype;
1043   (*B)->useordering = PETSC_TRUE;
1044   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1045 
1046   if (ftype == MAT_FACTOR_LU) {
1047     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1048     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1049   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1050 
1051   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1052   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1053   PetscFunctionReturn(0);
1054 }
1055