xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 8fa9e22e65c6f47173f479ecf868ef7fe40cdcbd)
1 #include "petsc/private/petscimpl.h"
2 #include <petscsystypes.h>
3 #include <petscerror.h>
4 #include <petscveckokkos.hpp>
5 
6 #include <Kokkos_Core.hpp>
7 #include <KokkosBlas.hpp>
8 #include <KokkosSparse_CrsMatrix.hpp>
9 #include <KokkosSparse_spmv.hpp>
10 #include <KokkosSparse_spmv.hpp>
11 #include <../src/mat/impls/aij/seq/aij.h>
12 
13 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
14 #include <petscmat.h>
15 
16 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
17 
18 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
19 {
20   PetscErrorCode    ierr;
21   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
22 
23   PetscFunctionBegin;
24   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
25   if (aijkok && aijkok->device_mat_d.data()) {
26     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
27   }
28   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
29   PetscFunctionReturn(0);
30 }
31 
32 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
33 {
34   Mat_SeqAIJ       *aijseq = (Mat_SeqAIJ*)A->data;
35   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
36   PetscInt         nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
37 
38   PetscFunctionBegin;
39   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
40   /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */
41   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
42     delete aijkok;
43     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
44     aijkok->nonzerostate = A->nonzerostate;
45     A->spptr             = aijkok;
46   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
47     Kokkos::deep_copy(aijkok->a_d,aijkok->a_h);
48   }
49   A->offloadmask = PETSC_OFFLOAD_BOTH;
50   PetscFunctionReturn(0);
51 }
52 
53 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
54 {
55   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
56 
57   PetscFunctionBegin;
58   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
59   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
60     if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
61     Kokkos::deep_copy(aijkok->a_h,aijkok->a_d);
62     A->offloadmask = PETSC_OFFLOAD_BOTH;
63   }
64   PetscFunctionReturn(0);
65 }
66 
67 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
68 {
69   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
70   PetscErrorCode ierr;
71 
72   PetscFunctionBegin;
73   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
74   *array = a->a;
75   A->offloadmask = PETSC_OFFLOAD_CPU;
76   PetscFunctionReturn(0);
77 }
78 
79 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
80 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat)
81 {
82   Mat_SeqAIJKokkos *aijkok;
83   Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat);
84 
85   PetscFunctionBegin;
86   // ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
87   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
88   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos");
89   aijkok->device_mat_d = create_mirror(DeviceMemorySpace(),h_mat_k);
90   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
91   PetscFunctionReturn(0);
92 }
93 
94 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
95 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat)
96 {
97   Mat_SeqAIJKokkos *aijkok;
98 
99   PetscFunctionBegin;
100   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
101   if (aijkok && aijkok->device_mat_d.data()) {
102     *d_mat = aijkok->device_mat_d.data();
103   } else {
104     PetscErrorCode   ierr;
105     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
106     *d_mat  = NULL;
107   }
108   PetscFunctionReturn(0);
109 }
110 
111 /* y = A x */
112 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
113 {
114   PetscErrorCode                   ierr;
115   Mat_SeqAIJKokkos                 *aijkok;
116   ConstPetscScalarViewDevice_t     xv;
117   PetscScalarViewDevice_t          yv;
118 
119   PetscFunctionBegin;
120   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
121   ierr   = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
122   ierr   = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
123   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
124   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
125   ierr   = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
126   ierr   = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
127   /* 2.0*aijkok->csr.nnz()-aijkok->csr.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */
128   ierr   = WaitForKokkos();CHKERRQ(ierr);
129   ierr   = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
130   PetscFunctionReturn(0);
131 }
132 
133 /* y = A^T x */
134 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
135 {
136   PetscErrorCode                   ierr;
137   Mat_SeqAIJKokkos                 *aijkok;
138   ConstPetscScalarViewDevice_t     xv;
139   PetscScalarViewDevice_t          yv;
140 
141   PetscFunctionBegin;
142   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
143   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
144   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
145   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
146   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
147   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
148   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
149   ierr = WaitForKokkos();CHKERRQ(ierr);
150   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
151   PetscFunctionReturn(0);
152 }
153 
154 /* y = A^H x */
155 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
156 {
157   PetscErrorCode                   ierr;
158   Mat_SeqAIJKokkos                 *aijkok;
159   ConstPetscScalarViewDevice_t     xv;
160   PetscScalarViewDevice_t          yv;
161 
162   PetscFunctionBegin;
163   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
164   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
165   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
166   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
167   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
168   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
169   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
170   ierr = WaitForKokkos();CHKERRQ(ierr);
171   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
172   PetscFunctionReturn(0);
173 }
174 
175 /* z = A x + y */
176 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
177 {
178   PetscErrorCode                   ierr;
179   Mat_SeqAIJKokkos                 *aijkok;
180   ConstPetscScalarViewDevice_t     xv,yv;
181   PetscScalarViewDevice_t          zv;
182 
183   PetscFunctionBegin;
184   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
185   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
186   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
187   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
188   if (zz != yy) Kokkos::deep_copy(zv,yv);
189   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
190   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
191   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
192   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
193   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
194   ierr = WaitForKokkos();CHKERRQ(ierr);
195   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
196   PetscFunctionReturn(0);
197 }
198 
199 /* z = A^T x + y */
200 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
201 {
202   PetscErrorCode                   ierr;
203   Mat_SeqAIJKokkos                 *aijkok;
204   ConstPetscScalarViewDevice_t     xv,yv;
205   PetscScalarViewDevice_t          zv;
206 
207   PetscFunctionBegin;
208   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
209   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
210   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
211   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
212   if (zz != yy) Kokkos::deep_copy(zv,yv);
213   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
214   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
215   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
216   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
217   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
218   ierr = WaitForKokkos();CHKERRQ(ierr);
219   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
220   PetscFunctionReturn(0);
221 }
222 
223 /* z = A^H x + y */
224 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
225 {
226   PetscErrorCode                   ierr;
227   Mat_SeqAIJKokkos                 *aijkok;
228   ConstPetscScalarViewDevice_t     xv,yv;
229   PetscScalarViewDevice_t          zv;
230 
231   PetscFunctionBegin;
232   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
233   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
234   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
235   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
236   if (zz != yy) Kokkos::deep_copy(zv,yv);
237   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
238   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
239   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
240   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
241   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
242   ierr = WaitForKokkos();CHKERRQ(ierr);
243   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
244   PetscFunctionReturn(0);
245 }
246 
247 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
248 {
249   PetscErrorCode ierr;
250   Mat            B;
251   Mat_SeqAIJ     *aij;
252 
253   PetscFunctionBegin;
254   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
255   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
256     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
257   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
258     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
259   }
260 
261   B    = *newmat;
262   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
263   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
264   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
265   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
266   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr);
267   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
268   aij  = (Mat_SeqAIJ*)B->data;
269   aij->inode.use = PETSC_FALSE;
270 
271   B->offloadmask = PETSC_OFFLOAD_CPU;
272   PetscFunctionReturn(0);
273 }
274 
275 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
276 {
277   PetscErrorCode ierr;
278 
279   PetscFunctionBegin;
280   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
281   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
282   PetscFunctionReturn(0);
283 }
284 
285 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
286 {
287   PetscErrorCode        ierr;
288   Mat_SeqAIJKokkos      *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
289 
290   PetscFunctionBegin;
291   if (aijkok && aijkok->device_mat_d.data()) {
292     delete aijkok->colmap_d;
293     delete aijkok->i_uncompressed_d;
294   }
295   delete aijkok;
296   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr);
297   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
298   PetscFunctionReturn(0);
299 }
300 
301 #if 0
302 static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data)
303 {
304   MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data);
305 
306   PetscFunctionBegin;
307   delete kh;
308   PetscFunctionReturn(0);
309 }
310 
311 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
312 {
313   Mat_Product          *product = C->product;
314   Mat                  A,B;
315   MatProductType       ptype;
316   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
317   bool                 tA,tB;
318   PetscErrorCode       ierr;
319   MatMatKernelHandle_t *kh;
320   Mat_SeqAIJ           *c;
321 
322   PetscFunctionBegin;
323   MatCheckProduct(C,1);
324   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
325   A = product->A;
326   B = product->B;
327   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
328   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
329   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
330   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
331   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
332   kh   = static_cast<MatMatKernelHandle_t*>(C->product->data);
333   ptype = product->type;
334   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
335   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
336   switch (ptype) {
337   case MATPRODUCT_AB:
338     tA = false;
339     tB = false;
340     break;
341   case MATPRODUCT_AtB:
342     tA = true;
343     tB = false;
344     break;
345   case MATPRODUCT_ABt:
346     tA = false;
347     tB = true;
348     break;
349   default:
350     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
351   }
352 
353   KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr);
354   C->offloadmask = PETSC_OFFLOAD_GPU;
355   /* shorter version of MatAssemblyEnd_SeqAIJ */
356   c = (Mat_SeqAIJ*)C->data;
357   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
358   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
359   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
360   c->reallocs         = 0;
361   C->info.mallocs    += 0;
362   C->info.nz_unneeded = 0;
363   C->assembled = C->was_assembled = PETSC_TRUE;
364   C->num_ass++;
365   /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */
366   // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray()
367   C->offloadmask = PETSC_OFFLOAD_BOTH;
368   // Also, we should add support to copy back from device to host
369   PetscFunctionReturn(0);
370 }
371 
372 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
373 {
374   Mat_Product          *product = C->product;
375   Mat                  A,B;
376   MatProductType       ptype;
377   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
378   PetscInt             m,n,k;
379   bool                 tA,tB;
380   PetscErrorCode       ierr;
381   Mat_SeqAIJ           *c;
382   MatMatKernelHandle_t *kh;
383 
384   PetscFunctionBegin;
385   MatCheckProduct(C,1);
386   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
387   A = product->A;
388   B = product->B;
389   // TODO only copy the i,j data, not the values
390   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
391   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
392   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
393   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
394   ptype = product->type;
395   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
396   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
397   switch (ptype) {
398   case MATPRODUCT_AB:
399     tA = false;
400     tB = false;
401     m = A->rmap->n;
402     n = B->cmap->n;
403     break;
404   case MATPRODUCT_AtB:
405     tA = true;
406     tB = false;
407     m = A->cmap->n;
408     n = B->cmap->n;
409     break;
410   case MATPRODUCT_ABt:
411     tA = false;
412     tB = true;
413     m = A->rmap->n;
414     n = B->rmap->n;
415     break;
416   default:
417     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
418   }
419   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
420   ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr);
421   c = (Mat_SeqAIJ*)C->data;
422 
423   kh = new MatMatKernelHandle_t;
424   // TODO SZ: ADD RUNTIME SELECTION OF THESE
425   kh->set_team_work_size(16);
426   kh->set_dynamic_scheduling(true);
427   // Select an spgemm algorithm, limited by configuration at compile-time and
428   // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED,
429   // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL}
430   std::string myalg("SPGEMM_KK_MEMORY");
431   kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg));
432 
433   /////////////////////////////////////
434   // TODO JZ
435   ckok = NULL; //new Mat_SeqAIJKokkos();
436   C->spptr = ckok;
437   KokkosCsrMatrix_t ccsr; // here only to have the code compile
438   KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr);
439   //cerr = WaitForKOKKOS();CHKERRCUDA(cerr);
440   //c->nz = get_nnz_from_ccsr
441   //////////////////////////////////////
442   c->singlemalloc = PETSC_FALSE;
443   c->free_a       = PETSC_TRUE;
444   c->free_ij      = PETSC_TRUE;
445   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
446   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
447   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
448   ////////////////////////////////////
449   // TODO JZ copy from device to c->i and c->j
450   ////////////////////////////////////
451   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
452   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
453   c->maxnz = c->nz;
454   c->nonzerorowcnt = 0;
455   c->rmax = 0;
456   for (k = 0; k < m; k++) {
457     const PetscInt nn = c->i[k+1] - c->i[k];
458     c->ilen[k] = c->imax[k] = nn;
459     c->nonzerorowcnt += (PetscInt)!!nn;
460     c->rmax = PetscMax(c->rmax,nn);
461   }
462 
463   C->nonzerostate++;
464   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
465   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
466   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
467   ckok->nonzerostate = C->nonzerostate;
468   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
469   C->preallocated  = PETSC_TRUE;
470   C->assembled     = PETSC_FALSE;
471   C->was_assembled = PETSC_FALSE;
472 
473   C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
474   C->product->data = kh;
475   C->product->destroy = MatMatKernelHandleDestroy_Private;
476   PetscFunctionReturn(0);
477 }
478 
479 /* handles sparse matrix matrix ops */
480 PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
481 {
482   Mat_Product    *product = mat->product;
483   PetscErrorCode ierr;
484   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
485 
486   PetscFunctionBegin;
487   MatCheckProduct(mat,1);
488   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
489   if (product->type == MATPRODUCT_ABC) {
490     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
491   }
492   if (Biskok && Ciskok) {
493     switch (product->type) {
494     case MATPRODUCT_AB:
495     case MATPRODUCT_AtB:
496     case MATPRODUCT_ABt:
497       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
498       break;
499     case MATPRODUCT_PtAP:
500     case MATPRODUCT_RARt:
501     case MATPRODUCT_ABC:
502       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
503       break;
504     default:
505       break;
506     }
507   } else { /* fallback for AIJ */
508     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
509   }
510   PetscFunctionReturn(0);
511 }
512 #endif
513 
514 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
515 {
516   PetscErrorCode ierr;
517 
518   PetscFunctionBegin;
519   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
520   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
521   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
522   PetscFunctionReturn(0);
523 }
524 
525 static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
526 {
527   PetscErrorCode    ierr;
528   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
529   PetscFunctionBegin;
530   if (aijkok && aijkok->device_mat_d.data()) {
531     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported");
532   }
533   ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr);
534   PetscFunctionReturn(0);
535 }
536 
537 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
538 {
539   PetscErrorCode   ierr;
540   Mat_SeqAIJKokkos *aijkok;
541 
542   PetscFunctionBegin;
543   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
544   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
545   KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d);
546   A->offloadmask = PETSC_OFFLOAD_GPU;
547   ierr = WaitForKokkos();CHKERRQ(ierr);
548   ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr);
549   // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
550   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
551   PetscFunctionReturn(0);
552 }
553 
554 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
555 {
556   PetscErrorCode   ierr;
557   PetscBool        both = PETSC_FALSE;
558   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
559   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
560 
561   PetscFunctionBegin;
562   if (aijkok && aijkok->a_d.data()) {
563     KokkosBlas::fill(aijkok->a_d,0.0);
564     both = PETSC_TRUE;
565   }
566   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
567   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
568   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
569   else A->offloadmask = PETSC_OFFLOAD_CPU;
570   PetscFunctionReturn(0);
571 }
572 
573 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str)
574 {
575   PetscErrorCode ierr;
576   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
577 
578   PetscFunctionBegin;
579   if (X->ops->axpy != Y->ops->axpy) {
580     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
581     PetscFunctionReturn(0);
582   }
583   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) {
584     PetscBool e;
585     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
586     if (e) {
587       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
588       if (e) str = SAME_NONZERO_PATTERN;
589     }
590   }
591   if (str != SAME_NONZERO_PATTERN) {
592     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
593     PetscFunctionReturn(0);
594   } else {
595     if (Y->offloadmask == PETSC_OFFLOAD_CPU && X->offloadmask == PETSC_OFFLOAD_CPU) {
596       ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
597       PetscFunctionReturn(0);
598     } else {
599       ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
600       ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
601     }
602     Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
603     Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
604     if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) {
605       KokkosBlas::axpy(a,aijkokX->a_d,aijkokY->a_d);
606       Y->offloadmask = PETSC_OFFLOAD_GPU;
607       ierr = WaitForKokkos();CHKERRQ(ierr);
608       ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr);
609       // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
610       ierr = MatSeqAIJKokkosSyncHost(Y);CHKERRQ(ierr);
611     } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???");
612   }
613   PetscFunctionReturn(0);
614 }
615 
616 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
617 {
618   PetscFunctionBegin;
619   A->ops->setvalues                 = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */
620   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
621   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
622   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
623   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
624   A->ops->scale                     = MatScale_SeqAIJKokkos;
625   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
626   //A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
627   A->ops->mult                      = MatMult_SeqAIJKokkos;
628   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
629   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
630   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
631   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
632   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
633   PetscFunctionReturn(0);
634 }
635 
636 /* --------------------------------------------------------------------------------*/
637 /*C
638    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
639    (the default parallel PETSc format). This matrix will ultimately be handled by
640    Kokkos for calculations. For good matrix
641    assembly performance the user should preallocate the matrix storage by setting
642    the parameter nz (or the array nnz).  By setting these parameters accurately,
643    performance during matrix assembly can be increased by more than a factor of 50.
644 
645    Collective
646 
647    Input Parameters:
648 +  comm - MPI communicator, set to PETSC_COMM_SELF
649 .  m - number of rows
650 .  n - number of columns
651 .  nz - number of nonzeros per row (same for all rows)
652 -  nnz - array containing the number of nonzeros in the various rows
653          (possibly different for each row) or NULL
654 
655    Output Parameter:
656 .  A - the matrix
657 
658    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
659    MatXXXXSetPreallocation() paradgm instead of this routine directly.
660    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
661 
662    Notes:
663    If nnz is given then nz is ignored
664 
665    The AIJ format (also called the Yale sparse matrix format or
666    compressed row storage), is fully compatible with standard Fortran 77
667    storage.  That is, the stored row and column indices can begin at
668    either one (as in Fortran) or zero.  See the users' manual for details.
669 
670    Specify the preallocated storage with either nz or nnz (not both).
671    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
672    allocation.  For large problems you MUST preallocate memory or you
673    will get TERRIBLE performance, see the users' manual chapter on matrices.
674 
675    By default, this format uses inodes (identical nodes) when possible, to
676    improve numerical efficiency of matrix-vector products and solves. We
677    search for consecutive rows with the same nonzero structure, thereby
678    reusing matrix information to achieve increased efficiency.
679 
680    Level: intermediate
681 
682 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
683 @*/
684 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
685 {
686   PetscErrorCode ierr;
687 
688   PetscFunctionBegin;
689   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
690   ierr = MatCreate(comm,A);CHKERRQ(ierr);
691   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
692   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
693   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
694   PetscFunctionReturn(0);
695 }
696