xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 930e68a5a1dbab7612595fd12ba2e3af4e1d5d80)
1 #include "petsc/private/petscimpl.h"
2 #include <petscsystypes.h>
3 #include <petscerror.h>
4 #include <petscveckokkos.hpp>
5 
6 #include <Kokkos_Core.hpp>
7 #include <KokkosBlas.hpp>
8 #include <KokkosSparse_CrsMatrix.hpp>
9 #include <KokkosSparse_spmv.hpp>
10 #include <KokkosSparse_spmv.hpp>
11 #include <../src/mat/impls/aij/seq/aij.h>
12 
13 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
14 #include <petscmat.h>
15 
16 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
17 
18 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
19 {
20   PetscErrorCode    ierr;
21   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
22 
23   PetscFunctionBegin;
24   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
25   if (aijkok && aijkok->device_mat_d.data()) {
26     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
27   }
28   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
29   PetscFunctionReturn(0);
30 }
31 
32 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
33 {
34   Mat_SeqAIJ       *aijseq = (Mat_SeqAIJ*)A->data;
35   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
36   PetscInt         nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
37 
38   PetscFunctionBegin;
39   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
40   /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */
41   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
42     delete aijkok;
43     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
44     aijkok->nonzerostate = A->nonzerostate;
45     A->spptr             = aijkok;
46   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
47     Kokkos::deep_copy(aijkok->a_d,aijkok->a_h);
48   }
49   A->offloadmask = PETSC_OFFLOAD_BOTH;
50   PetscFunctionReturn(0);
51 }
52 
53 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
54 {
55   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
56 
57   PetscFunctionBegin;
58   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
59   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
60     if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
61     Kokkos::deep_copy(aijkok->a_h,aijkok->a_d);
62     A->offloadmask = PETSC_OFFLOAD_BOTH;
63   }
64   PetscFunctionReturn(0);
65 }
66 
67 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
68 {
69   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
70   PetscErrorCode ierr;
71 
72   PetscFunctionBegin;
73   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
74   *array = a->a;
75   A->offloadmask = PETSC_OFFLOAD_CPU;
76   PetscFunctionReturn(0);
77 }
78 
79 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
80 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat)
81 {
82   Mat_SeqAIJKokkos *aijkok;
83   Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat);
84 
85   PetscFunctionBegin;
86   // ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
87   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
88   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos");
89   aijkok->device_mat_d = create_mirror(DeviceMemorySpace(),h_mat_k);
90   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
91   PetscFunctionReturn(0);
92 }
93 
94 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
95 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat)
96 {
97   Mat_SeqAIJKokkos *aijkok;
98 
99   PetscFunctionBegin;
100   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
101   if (aijkok && aijkok->device_mat_d.data()) {
102     *d_mat = aijkok->device_mat_d.data();
103   } else {
104     PetscErrorCode   ierr;
105     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
106     *d_mat  = NULL;
107   }
108   PetscFunctionReturn(0);
109 }
110 
111 /* y = A x */
112 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
113 {
114   PetscErrorCode                   ierr;
115   Mat_SeqAIJKokkos                 *aijkok;
116   ConstPetscScalarViewDevice_t     xv;
117   PetscScalarViewDevice_t          yv;
118 
119   PetscFunctionBegin;
120   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
121   ierr   = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
122   ierr   = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
123   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
124   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
125   ierr   = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
126   ierr   = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
127   /* 2.0*aijkok->csr.nnz()-aijkok->csr.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */
128   ierr   = WaitForKokkos();CHKERRQ(ierr);
129   ierr   = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
130   PetscFunctionReturn(0);
131 }
132 
133 /* y = A^T x */
134 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
135 {
136   PetscErrorCode                   ierr;
137   Mat_SeqAIJKokkos                 *aijkok;
138   ConstPetscScalarViewDevice_t     xv;
139   PetscScalarViewDevice_t          yv;
140 
141   PetscFunctionBegin;
142   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
143   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
144   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
145   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
146   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
147   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
148   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
149   ierr = WaitForKokkos();CHKERRQ(ierr);
150   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
151   PetscFunctionReturn(0);
152 }
153 
154 /* y = A^H x */
155 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
156 {
157   PetscErrorCode                   ierr;
158   Mat_SeqAIJKokkos                 *aijkok;
159   ConstPetscScalarViewDevice_t     xv;
160   PetscScalarViewDevice_t          yv;
161 
162   PetscFunctionBegin;
163   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
164   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
165   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
166   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
167   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
168   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
169   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
170   ierr = WaitForKokkos();CHKERRQ(ierr);
171   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
172   PetscFunctionReturn(0);
173 }
174 
175 /* z = A x + y */
176 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
177 {
178   PetscErrorCode                   ierr;
179   Mat_SeqAIJKokkos                 *aijkok;
180   ConstPetscScalarViewDevice_t     xv,yv;
181   PetscScalarViewDevice_t          zv;
182 
183   PetscFunctionBegin;
184   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
185   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
186   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
187   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
188   if (zz != yy) Kokkos::deep_copy(zv,yv);
189   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
190   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
191   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
192   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
193   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
194   ierr = WaitForKokkos();CHKERRQ(ierr);
195   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
196   PetscFunctionReturn(0);
197 }
198 
199 /* z = A^T x + y */
200 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
201 {
202   PetscErrorCode                   ierr;
203   Mat_SeqAIJKokkos                 *aijkok;
204   ConstPetscScalarViewDevice_t     xv,yv;
205   PetscScalarViewDevice_t          zv;
206 
207   PetscFunctionBegin;
208   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
209   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
210   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
211   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
212   if (zz != yy) Kokkos::deep_copy(zv,yv);
213   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
214   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
215   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
216   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
217   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
218   ierr = WaitForKokkos();CHKERRQ(ierr);
219   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
220   PetscFunctionReturn(0);
221 }
222 
223 /* z = A^H x + y */
224 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
225 {
226   PetscErrorCode                   ierr;
227   Mat_SeqAIJKokkos                 *aijkok;
228   ConstPetscScalarViewDevice_t     xv,yv;
229   PetscScalarViewDevice_t          zv;
230 
231   PetscFunctionBegin;
232   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
233   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
234   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
235   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
236   if (zz != yy) Kokkos::deep_copy(zv,yv);
237   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
238   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
239   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
240   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
241   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
242   ierr = WaitForKokkos();CHKERRQ(ierr);
243   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
244   PetscFunctionReturn(0);
245 }
246 
247 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
248 {
249   PetscErrorCode ierr;
250   Mat            B;
251   Mat_SeqAIJ     *aij;
252 
253   PetscFunctionBegin;
254   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
255   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
256     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
257   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
258     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
259   }
260 
261   B    = *newmat;
262   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
263   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
264   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
265   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
266   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr);
267   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
268   aij  = (Mat_SeqAIJ*)B->data;
269   aij->inode.use = PETSC_FALSE;
270 
271   B->offloadmask = PETSC_OFFLOAD_CPU;
272   PetscFunctionReturn(0);
273 }
274 
275 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
276 {
277   PetscErrorCode ierr;
278 
279   PetscFunctionBegin;
280   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
281   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
282   PetscFunctionReturn(0);
283 }
284 
285 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
286 {
287   PetscErrorCode        ierr;
288   Mat_SeqAIJKokkos      *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
289 
290   PetscFunctionBegin;
291   if (aijkok && aijkok->device_mat_d.data()) {
292     delete aijkok->colmap_d;
293     delete aijkok->i_uncompressed_d;
294   }
295   delete aijkok;
296   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr);
297   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
298   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
299   PetscFunctionReturn(0);
300 }
301 
302 #if 0
303 static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data)
304 {
305   MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data);
306 
307   PetscFunctionBegin;
308   delete kh;
309   PetscFunctionReturn(0);
310 }
311 
312 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
313 {
314   Mat_Product          *product = C->product;
315   Mat                  A,B;
316   MatProductType       ptype;
317   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
318   bool                 tA,tB;
319   PetscErrorCode       ierr;
320   MatMatKernelHandle_t *kh;
321   Mat_SeqAIJ           *c;
322 
323   PetscFunctionBegin;
324   MatCheckProduct(C,1);
325   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
326   A = product->A;
327   B = product->B;
328   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
329   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
330   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
331   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
332   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
333   kh   = static_cast<MatMatKernelHandle_t*>(C->product->data);
334   ptype = product->type;
335   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
336   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
337   switch (ptype) {
338   case MATPRODUCT_AB:
339     tA = false;
340     tB = false;
341     break;
342   case MATPRODUCT_AtB:
343     tA = true;
344     tB = false;
345     break;
346   case MATPRODUCT_ABt:
347     tA = false;
348     tB = true;
349     break;
350   default:
351     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
352   }
353 
354   KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr);
355   C->offloadmask = PETSC_OFFLOAD_GPU;
356   /* shorter version of MatAssemblyEnd_SeqAIJ */
357   c = (Mat_SeqAIJ*)C->data;
358   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
359   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
360   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
361   c->reallocs         = 0;
362   C->info.mallocs    += 0;
363   C->info.nz_unneeded = 0;
364   C->assembled = C->was_assembled = PETSC_TRUE;
365   C->num_ass++;
366   /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */
367   // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray()
368   C->offloadmask = PETSC_OFFLOAD_BOTH;
369   // Also, we should add support to copy back from device to host
370   PetscFunctionReturn(0);
371 }
372 
373 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
374 {
375   Mat_Product          *product = C->product;
376   Mat                  A,B;
377   MatProductType       ptype;
378   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
379   PetscInt             m,n,k;
380   bool                 tA,tB;
381   PetscErrorCode       ierr;
382   Mat_SeqAIJ           *c;
383   MatMatKernelHandle_t *kh;
384 
385   PetscFunctionBegin;
386   MatCheckProduct(C,1);
387   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
388   A = product->A;
389   B = product->B;
390   // TODO only copy the i,j data, not the values
391   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
392   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
393   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
394   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
395   ptype = product->type;
396   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
397   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
398   switch (ptype) {
399   case MATPRODUCT_AB:
400     tA = false;
401     tB = false;
402     m = A->rmap->n;
403     n = B->cmap->n;
404     break;
405   case MATPRODUCT_AtB:
406     tA = true;
407     tB = false;
408     m = A->cmap->n;
409     n = B->cmap->n;
410     break;
411   case MATPRODUCT_ABt:
412     tA = false;
413     tB = true;
414     m = A->rmap->n;
415     n = B->rmap->n;
416     break;
417   default:
418     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
419   }
420   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
421   ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr);
422   c = (Mat_SeqAIJ*)C->data;
423 
424   kh = new MatMatKernelHandle_t;
425   // TODO SZ: ADD RUNTIME SELECTION OF THESE
426   kh->set_team_work_size(16);
427   kh->set_dynamic_scheduling(true);
428   // Select an spgemm algorithm, limited by configuration at compile-time and
429   // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED,
430   // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL}
431   std::string myalg("SPGEMM_KK_MEMORY");
432   kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg));
433 
434   /////////////////////////////////////
435   // TODO JZ
436   ckok = NULL; //new Mat_SeqAIJKokkos();
437   C->spptr = ckok;
438   KokkosCsrMatrix_t ccsr; // here only to have the code compile
439   KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr);
440   //cerr = WaitForKOKKOS();CHKERRCUDA(cerr);
441   //c->nz = get_nnz_from_ccsr
442   //////////////////////////////////////
443   c->singlemalloc = PETSC_FALSE;
444   c->free_a       = PETSC_TRUE;
445   c->free_ij      = PETSC_TRUE;
446   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
447   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
448   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
449   ////////////////////////////////////
450   // TODO JZ copy from device to c->i and c->j
451   ////////////////////////////////////
452   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
453   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
454   c->maxnz = c->nz;
455   c->nonzerorowcnt = 0;
456   c->rmax = 0;
457   for (k = 0; k < m; k++) {
458     const PetscInt nn = c->i[k+1] - c->i[k];
459     c->ilen[k] = c->imax[k] = nn;
460     c->nonzerorowcnt += (PetscInt)!!nn;
461     c->rmax = PetscMax(c->rmax,nn);
462   }
463 
464   C->nonzerostate++;
465   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
466   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
467   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
468   ckok->nonzerostate = C->nonzerostate;
469   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
470   C->preallocated  = PETSC_TRUE;
471   C->assembled     = PETSC_FALSE;
472   C->was_assembled = PETSC_FALSE;
473 
474   C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
475   C->product->data = kh;
476   C->product->destroy = MatMatKernelHandleDestroy_Private;
477   PetscFunctionReturn(0);
478 }
479 
480 /* handles sparse matrix matrix ops */
481 PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
482 {
483   Mat_Product    *product = mat->product;
484   PetscErrorCode ierr;
485   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
486 
487   PetscFunctionBegin;
488   MatCheckProduct(mat,1);
489   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
490   if (product->type == MATPRODUCT_ABC) {
491     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
492   }
493   if (Biskok && Ciskok) {
494     switch (product->type) {
495     case MATPRODUCT_AB:
496     case MATPRODUCT_AtB:
497     case MATPRODUCT_ABt:
498       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
499       break;
500     case MATPRODUCT_PtAP:
501     case MATPRODUCT_RARt:
502     case MATPRODUCT_ABC:
503       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
504       break;
505     default:
506       break;
507     }
508   } else { /* fallback for AIJ */
509     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
510   }
511   PetscFunctionReturn(0);
512 }
513 #endif
514 
515 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
516 {
517   PetscErrorCode ierr;
518 
519   PetscFunctionBegin;
520   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
521   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
522   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
523   PetscFunctionReturn(0);
524 }
525 
526 static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
527 {
528   PetscErrorCode    ierr;
529   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
530   PetscFunctionBegin;
531   if (aijkok && aijkok->device_mat_d.data()) {
532     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported");
533   }
534   ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr);
535   PetscFunctionReturn(0);
536 }
537 
538 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
539 {
540   PetscErrorCode   ierr;
541   Mat_SeqAIJKokkos *aijkok;
542 
543   PetscFunctionBegin;
544   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
545   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
546   KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d);
547   A->offloadmask = PETSC_OFFLOAD_GPU;
548   ierr = WaitForKokkos();CHKERRQ(ierr);
549   ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr);
550   // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
551   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
552   PetscFunctionReturn(0);
553 }
554 
555 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
556 {
557   PetscErrorCode   ierr;
558   PetscBool        both = PETSC_FALSE;
559   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
560   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
561 
562   PetscFunctionBegin;
563   if (aijkok && aijkok->a_d.data()) {
564     KokkosBlas::fill(aijkok->a_d,0.0);
565     both = PETSC_TRUE;
566   }
567   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
568   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
569   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
570   else A->offloadmask = PETSC_OFFLOAD_CPU;
571   PetscFunctionReturn(0);
572 }
573 
574 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str)
575 {
576   PetscErrorCode ierr;
577   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
578 
579   PetscFunctionBegin;
580   if (X->ops->axpy != Y->ops->axpy) {
581     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
582     PetscFunctionReturn(0);
583   }
584   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) {
585     PetscBool e;
586     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
587     if (e) {
588       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
589       if (e) str = SAME_NONZERO_PATTERN;
590     }
591   }
592   if (str != SAME_NONZERO_PATTERN) {
593     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
594     PetscFunctionReturn(0);
595   } else {
596     if (Y->offloadmask == PETSC_OFFLOAD_CPU && X->offloadmask == PETSC_OFFLOAD_CPU) {
597       ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
598       PetscFunctionReturn(0);
599     } else {
600       ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
601       ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
602     }
603     Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
604     Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
605     if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) {
606       KokkosBlas::axpy(a,aijkokX->a_d,aijkokY->a_d);
607       Y->offloadmask = PETSC_OFFLOAD_GPU;
608       ierr = WaitForKokkos();CHKERRQ(ierr);
609       ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr);
610       // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
611       ierr = MatSeqAIJKokkosSyncHost(Y);CHKERRQ(ierr);
612     } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???");
613   }
614   PetscFunctionReturn(0);
615 }
616 
617 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
618 {
619   PetscFunctionBegin;
620   A->ops->setvalues                 = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */
621   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
622   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
623   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
624   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
625   A->ops->scale                     = MatScale_SeqAIJKokkos;
626   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
627   //A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
628   A->ops->mult                      = MatMult_SeqAIJKokkos;
629   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
630   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
631   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
632   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
633   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
634   PetscFunctionReturn(0);
635 }
636 
637 /* --------------------------------------------------------------------------------*/
638 /*C
639    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
640    (the default parallel PETSc format). This matrix will ultimately be handled by
641    Kokkos for calculations. For good matrix
642    assembly performance the user should preallocate the matrix storage by setting
643    the parameter nz (or the array nnz).  By setting these parameters accurately,
644    performance during matrix assembly can be increased by more than a factor of 50.
645 
646    Collective
647 
648    Input Parameters:
649 +  comm - MPI communicator, set to PETSC_COMM_SELF
650 .  m - number of rows
651 .  n - number of columns
652 .  nz - number of nonzeros per row (same for all rows)
653 -  nnz - array containing the number of nonzeros in the various rows
654          (possibly different for each row) or NULL
655 
656    Output Parameter:
657 .  A - the matrix
658 
659    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
660    MatXXXXSetPreallocation() paradgm instead of this routine directly.
661    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
662 
663    Notes:
664    If nnz is given then nz is ignored
665 
666    The AIJ format (also called the Yale sparse matrix format or
667    compressed row storage), is fully compatible with standard Fortran 77
668    storage.  That is, the stored row and column indices can begin at
669    either one (as in Fortran) or zero.  See the users' manual for details.
670 
671    Specify the preallocated storage with either nz or nnz (not both).
672    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
673    allocation.  For large problems you MUST preallocate memory or you
674    will get TERRIBLE performance, see the users' manual chapter on matrices.
675 
676    By default, this format uses inodes (identical nodes) when possible, to
677    improve numerical efficiency of matrix-vector products and solves. We
678    search for consecutive rows with the same nonzero structure, thereby
679    reusing matrix information to achieve increased efficiency.
680 
681    Level: intermediate
682 
683 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
684 @*/
685 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
686 {
687   PetscErrorCode ierr;
688 
689   PetscFunctionBegin;
690   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
691   ierr = MatCreate(comm,A);CHKERRQ(ierr);
692   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
693   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
694   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
695   PetscFunctionReturn(0);
696 }
697 
698 // factorizations
699 
700 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOS(Mat B,Mat A,const MatFactorInfo *info)
701 {
702   // Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
703   // IS             isrow = b->row,iscol = b->col;
704   // PetscBool      row_identity,col_identity;
705   PetscErrorCode ierr;
706 
707   PetscFunctionBegin;
708   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
709   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
710   B->offloadmask = PETSC_OFFLOAD_CPU;
711   // solves stay on CPU now
712   /* determine which version of MatSolve needs to be used. */
713   // ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
714   // ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
715   // if (row_identity && col_identity) {
716   //   B->ops->solve = MatSolve_SeqAIJKOKKOS_NaturalOrdering;
717   //   B->ops->solvetranspose = MatSolveTranspose_SeqAIJKOKKOS_NaturalOrdering;
718   //   B->ops->matsolve = NULL;
719   //   B->ops->matsolvetranspose = NULL;
720   // } else {
721   //   B->ops->solve = MatSolve_SeqAIJKOKKOS;
722   //   B->ops->solvetranspose = MatSolveTranspose_SeqAIJKOKKOS;
723   //   B->ops->matsolve = NULL;
724   //   B->ops->matsolvetranspose = NULL;
725   // }
726 
727   /* get the triangular factors */
728   // ierr = MatSeqAIJKOKKOSILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
729   PetscFunctionReturn(0);
730 }
731 
732 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOS(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
733 {
734   PetscErrorCode               ierr;
735 
736   PetscFunctionBegin;
737   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
738   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOS;
739   PetscFunctionReturn(0);
740 }
741 
742 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat,MatFactorType,Mat*);
743 
744 
745 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
746 {
747   PetscErrorCode ierr;
748 
749   PetscFunctionBegin;
750   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos);CHKERRQ(ierr);
751   PetscFunctionReturn(0);
752 }
753 
754 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos(Mat A,MatSolverType *type)
755 {
756   PetscFunctionBegin;
757   *type = MATSOLVERKOKKOS;
758   PetscFunctionReturn(0);
759 }
760 
761 /*MC
762   MATSOLVERKOKKOS = "kokkos" - A matrix solver type providing triangular solvers for sequential matrices
763   on a single GPU of type, seqaijkokkos, aijkokkos.
764 
765   Level: beginner
766 
767 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKOKKOS(), MATAIJKOKKOS, MatCreateAIJKOKKOS(), MatKOKKOSSetFormat(), MatKOKKOSStorageFormat, MatKOKKOSFormatOperation
768 M*/
769 
770 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat A,MatFactorType ftype,Mat *B)
771 {
772   PetscErrorCode ierr;
773   PetscInt       n = A->rmap->n;
774 
775   PetscFunctionBegin;
776   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
777   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
778   (*B)->factortype = ftype;
779   (*B)->useordering = PETSC_TRUE;
780   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
781 
782   if (ftype == MAT_FACTOR_LU /* || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT*/) {
783     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
784     // (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKOKKOS;
785     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOS;
786     // } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
787     // (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKOKKOS;
788     // (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKOKKOS;
789   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
790 
791   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
792   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos);CHKERRQ(ierr);
793   PetscFunctionReturn(0);
794 }
795