xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 2e4af2ae5ea14e06d4fbd1ab1c02cabcc89ffdd5)
1 #include "petsc/private/petscimpl.h"
2 #include <petscsystypes.h>
3 #include <petscerror.h>
4 #include <petscveckokkos.hpp>
5 
6 #include <Kokkos_Core.hpp>
7 #include <KokkosBlas.hpp>
8 #include <KokkosSparse_CrsMatrix.hpp>
9 #include <KokkosSparse_spmv.hpp>
10 #include <KokkosSparse_spmv.hpp>
11 #include <../src/mat/impls/aij/seq/aij.h>
12 
13 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
14 #include <petscmat.h>
15 
16 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
17 
18 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
19 {
20   PetscErrorCode    ierr;
21   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
22 
23   PetscFunctionBegin;
24   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
25   if (aijkok && aijkok->device_mat_d.data()) {
26     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
27   }
28   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
29   PetscFunctionReturn(0);
30 }
31 
32 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
33 {
34   Mat_SeqAIJ       *aijseq = (Mat_SeqAIJ*)A->data;
35   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
36   PetscInt         nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
37 
38   PetscFunctionBegin;
39   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
40   /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */
41   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
42     delete aijkok;
43     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
44     aijkok->nonzerostate = A->nonzerostate;
45     A->spptr             = aijkok;
46   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
47     Kokkos::deep_copy(aijkok->a_d,aijkok->a_h);
48   }
49   A->offloadmask = PETSC_OFFLOAD_BOTH;
50   PetscFunctionReturn(0);
51 }
52 
53 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
54 {
55   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
56 
57   PetscFunctionBegin;
58   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
59   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
60     if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
61     Kokkos::deep_copy(aijkok->a_h,aijkok->a_d);
62     A->offloadmask = PETSC_OFFLOAD_BOTH;
63   }
64   PetscFunctionReturn(0);
65 }
66 
67 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
68 {
69   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
70   PetscErrorCode ierr;
71 
72   PetscFunctionBegin;
73   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
74   *array = a->a;
75   A->offloadmask = PETSC_OFFLOAD_CPU;
76   PetscFunctionReturn(0);
77 }
78 
79 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
80 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat)
81 {
82   Mat_SeqAIJKokkos *aijkok;
83   Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat);
84 
85   PetscFunctionBegin;
86   // ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
87   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
88   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos");
89   aijkok->device_mat_d = create_mirror(DeviceMemorySpace(),h_mat_k);
90   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
91   PetscFunctionReturn(0);
92 }
93 
94 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
95 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat)
96 {
97   Mat_SeqAIJKokkos *aijkok;
98 
99   PetscFunctionBegin;
100   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
101   if (aijkok && aijkok->device_mat_d.data()) {
102     *d_mat = aijkok->device_mat_d.data();
103   } else {
104     PetscErrorCode   ierr;
105     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
106     *d_mat  = NULL;
107   }
108   PetscFunctionReturn(0);
109 }
110 
111 /* y = A x */
112 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
113 {
114   PetscErrorCode                   ierr;
115   Mat_SeqAIJKokkos                 *aijkok;
116   ConstPetscScalarViewDevice_t     xv;
117   PetscScalarViewDevice_t          yv;
118 
119   PetscFunctionBegin;
120   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
121   ierr   = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
122   ierr   = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
123   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
124   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
125   ierr   = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
126   ierr   = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
127   /* 2.0*aijkok->csr.nnz()-aijkok->csr.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */
128   ierr   = WaitForKokkos();CHKERRQ(ierr);
129   ierr   = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
130   PetscFunctionReturn(0);
131 }
132 
133 /* y = A^T x */
134 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
135 {
136   PetscErrorCode                   ierr;
137   Mat_SeqAIJKokkos                 *aijkok;
138   ConstPetscScalarViewDevice_t     xv;
139   PetscScalarViewDevice_t          yv;
140 
141   PetscFunctionBegin;
142   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
143   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
144   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
145   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
146   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
147   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
148   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
149   ierr = WaitForKokkos();CHKERRQ(ierr);
150   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
151   PetscFunctionReturn(0);
152 }
153 
154 /* y = A^H x */
155 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
156 {
157   PetscErrorCode                   ierr;
158   Mat_SeqAIJKokkos                 *aijkok;
159   ConstPetscScalarViewDevice_t     xv;
160   PetscScalarViewDevice_t          yv;
161 
162   PetscFunctionBegin;
163   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
164   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
165   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
166   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
167   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
168   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
169   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
170   ierr = WaitForKokkos();CHKERRQ(ierr);
171   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
172   PetscFunctionReturn(0);
173 }
174 
175 /* z = A x + y */
176 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
177 {
178   PetscErrorCode                   ierr;
179   Mat_SeqAIJKokkos                 *aijkok;
180   ConstPetscScalarViewDevice_t     xv,yv;
181   PetscScalarViewDevice_t          zv;
182 
183   PetscFunctionBegin;
184   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
185   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
186   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
187   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
188   if (zz != yy) Kokkos::deep_copy(zv,yv);
189   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
190   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
191   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
192   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
193   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
194   ierr = WaitForKokkos();CHKERRQ(ierr);
195   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
196   PetscFunctionReturn(0);
197 }
198 
199 /* z = A^T x + y */
200 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
201 {
202   PetscErrorCode                   ierr;
203   Mat_SeqAIJKokkos                 *aijkok;
204   ConstPetscScalarViewDevice_t     xv,yv;
205   PetscScalarViewDevice_t          zv;
206 
207   PetscFunctionBegin;
208   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
209   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
210   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
211   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
212   if (zz != yy) Kokkos::deep_copy(zv,yv);
213   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
214   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
215   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
216   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
217   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
218   ierr = WaitForKokkos();CHKERRQ(ierr);
219   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
220   PetscFunctionReturn(0);
221 }
222 
223 /* z = A^H x + y */
224 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
225 {
226   PetscErrorCode                   ierr;
227   Mat_SeqAIJKokkos                 *aijkok;
228   ConstPetscScalarViewDevice_t     xv,yv;
229   PetscScalarViewDevice_t          zv;
230 
231   PetscFunctionBegin;
232   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
233   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
234   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
235   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
236   if (zz != yy) Kokkos::deep_copy(zv,yv);
237   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
238   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
239   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
240   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
241   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
242   ierr = WaitForKokkos();CHKERRQ(ierr);
243   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
244   PetscFunctionReturn(0);
245 }
246 
247 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
248 {
249   PetscErrorCode ierr;
250   Mat            B;
251   Mat_SeqAIJ     *aij;
252 
253   PetscFunctionBegin;
254   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
255   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
256     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
257   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
258     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
259   }
260 
261   B    = *newmat;
262   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
263   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
264   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
265   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
266   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr);
267   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
268   aij  = (Mat_SeqAIJ*)B->data;
269   aij->inode.use = PETSC_FALSE;
270 
271   B->offloadmask = PETSC_OFFLOAD_CPU;
272   PetscFunctionReturn(0);
273 }
274 
275 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
276 {
277   PetscErrorCode ierr;
278 
279   PetscFunctionBegin;
280   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
281   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
282   PetscFunctionReturn(0);
283 }
284 
285 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
286 {
287   PetscErrorCode        ierr;
288   Mat_SeqAIJKokkos      *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
289 
290   PetscFunctionBegin;
291   if (aijkok && aijkok->device_mat_d.data()) {
292     delete aijkok->colmap_d;
293     delete aijkok->i_uncompressed_d;
294   }
295   delete aijkok;
296   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr);
297   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
298   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
299   PetscFunctionReturn(0);
300 }
301 
302 #if 0
303 static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data)
304 {
305   MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data);
306 
307   PetscFunctionBegin;
308   delete kh;
309   PetscFunctionReturn(0);
310 }
311 
312 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
313 {
314   Mat_Product          *product = C->product;
315   Mat                  A,B;
316   MatProductType       ptype;
317   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
318   bool                 tA,tB;
319   PetscErrorCode       ierr;
320   MatMatKernelHandle_t *kh;
321   Mat_SeqAIJ           *c;
322 
323   PetscFunctionBegin;
324   MatCheckProduct(C,1);
325   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
326   A = product->A;
327   B = product->B;
328   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
329   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
330   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
331   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
332   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
333   kh   = static_cast<MatMatKernelHandle_t*>(C->product->data);
334   ptype = product->type;
335   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
336   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
337   switch (ptype) {
338   case MATPRODUCT_AB:
339     tA = false;
340     tB = false;
341     break;
342   case MATPRODUCT_AtB:
343     tA = true;
344     tB = false;
345     break;
346   case MATPRODUCT_ABt:
347     tA = false;
348     tB = true;
349     break;
350   default:
351     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
352   }
353 
354   KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr);
355   C->offloadmask = PETSC_OFFLOAD_GPU;
356   /* shorter version of MatAssemblyEnd_SeqAIJ */
357   c = (Mat_SeqAIJ*)C->data;
358   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
359   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
360   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
361   c->reallocs         = 0;
362   C->info.mallocs    += 0;
363   C->info.nz_unneeded = 0;
364   C->assembled = C->was_assembled = PETSC_TRUE;
365   C->num_ass++;
366   /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */
367   // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray()
368   C->offloadmask = PETSC_OFFLOAD_BOTH;
369   // Also, we should add support to copy back from device to host
370   PetscFunctionReturn(0);
371 }
372 
373 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
374 {
375   Mat_Product          *product = C->product;
376   Mat                  A,B;
377   MatProductType       ptype;
378   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
379   PetscInt             m,n,k;
380   bool                 tA,tB;
381   PetscErrorCode       ierr;
382   Mat_SeqAIJ           *c;
383   MatMatKernelHandle_t *kh;
384 
385   PetscFunctionBegin;
386   MatCheckProduct(C,1);
387   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
388   A = product->A;
389   B = product->B;
390   // TODO only copy the i,j data, not the values
391   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
392   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
393   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
394   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
395   ptype = product->type;
396   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
397   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
398   switch (ptype) {
399   case MATPRODUCT_AB:
400     tA = false;
401     tB = false;
402     m = A->rmap->n;
403     n = B->cmap->n;
404     break;
405   case MATPRODUCT_AtB:
406     tA = true;
407     tB = false;
408     m = A->cmap->n;
409     n = B->cmap->n;
410     break;
411   case MATPRODUCT_ABt:
412     tA = false;
413     tB = true;
414     m = A->rmap->n;
415     n = B->rmap->n;
416     break;
417   default:
418     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
419   }
420   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
421   ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr);
422   c = (Mat_SeqAIJ*)C->data;
423 
424   kh = new MatMatKernelHandle_t;
425   // TODO SZ: ADD RUNTIME SELECTION OF THESE
426   kh->set_team_work_size(16);
427   kh->set_dynamic_scheduling(true);
428   // Select an spgemm algorithm, limited by configuration at compile-time and
429   // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED,
430   // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL}
431   std::string myalg("SPGEMM_KK_MEMORY");
432   kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg));
433 
434   /////////////////////////////////////
435   // TODO JZ
436   ckok = NULL; //new Mat_SeqAIJKokkos();
437   C->spptr = ckok;
438   KokkosCsrMatrix_t ccsr; // here only to have the code compile
439   KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr);
440   //cerr = WaitForKOKKOS();CHKERRCUDA(cerr);
441   //c->nz = get_nnz_from_ccsr
442   //////////////////////////////////////
443   c->singlemalloc = PETSC_FALSE;
444   c->free_a       = PETSC_TRUE;
445   c->free_ij      = PETSC_TRUE;
446   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
447   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
448   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
449   ////////////////////////////////////
450   // TODO JZ copy from device to c->i and c->j
451   ////////////////////////////////////
452   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
453   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
454   c->maxnz = c->nz;
455   c->nonzerorowcnt = 0;
456   c->rmax = 0;
457   for (k = 0; k < m; k++) {
458     const PetscInt nn = c->i[k+1] - c->i[k];
459     c->ilen[k] = c->imax[k] = nn;
460     c->nonzerorowcnt += (PetscInt)!!nn;
461     c->rmax = PetscMax(c->rmax,nn);
462   }
463 
464   C->nonzerostate++;
465   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
466   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
467   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
468   ckok->nonzerostate = C->nonzerostate;
469   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
470   C->preallocated  = PETSC_TRUE;
471   C->assembled     = PETSC_FALSE;
472   C->was_assembled = PETSC_FALSE;
473 
474   C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
475   C->product->data = kh;
476   C->product->destroy = MatMatKernelHandleDestroy_Private;
477   PetscFunctionReturn(0);
478 }
479 
480 /* handles sparse matrix matrix ops */
481 PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
482 {
483   Mat_Product    *product = mat->product;
484   PetscErrorCode ierr;
485   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
486 
487   PetscFunctionBegin;
488   MatCheckProduct(mat,1);
489   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
490   if (product->type == MATPRODUCT_ABC) {
491     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
492   }
493   if (Biskok && Ciskok) {
494     switch (product->type) {
495     case MATPRODUCT_AB:
496     case MATPRODUCT_AtB:
497     case MATPRODUCT_ABt:
498       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
499       break;
500     case MATPRODUCT_PtAP:
501     case MATPRODUCT_RARt:
502     case MATPRODUCT_ABC:
503       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
504       break;
505     default:
506       break;
507     }
508   } else { /* fallback for AIJ */
509     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
510   }
511   PetscFunctionReturn(0);
512 }
513 #endif
514 
515 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
516 {
517   PetscErrorCode ierr;
518 
519   PetscFunctionBegin;
520   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
521   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
522   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
523   PetscFunctionReturn(0);
524 }
525 
526 static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
527 {
528   PetscErrorCode    ierr;
529   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
530   PetscFunctionBegin;
531   if (aijkok && aijkok->device_mat_d.data()) {
532     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported");
533   }
534   ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr);
535   PetscFunctionReturn(0);
536 }
537 
538 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
539 {
540   PetscErrorCode   ierr;
541   Mat_SeqAIJKokkos *aijkok;
542 
543   PetscFunctionBegin;
544   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
545   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
546   KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d);
547   A->offloadmask = PETSC_OFFLOAD_GPU;
548   ierr = WaitForKokkos();CHKERRQ(ierr);
549   ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr);
550   // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
551   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
552   PetscFunctionReturn(0);
553 }
554 
555 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
556 {
557   PetscErrorCode   ierr;
558   PetscBool        both = PETSC_FALSE;
559   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
560   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
561 
562   PetscFunctionBegin;
563   if (aijkok && aijkok->a_d.data()) {
564     KokkosBlas::fill(aijkok->a_d,0.0);
565     both = PETSC_TRUE;
566   }
567   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
568   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
569   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
570   else A->offloadmask = PETSC_OFFLOAD_CPU;
571   PetscFunctionReturn(0);
572 }
573 
574 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str)
575 {
576   PetscErrorCode ierr;
577   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
578 
579   PetscFunctionBegin;
580   if (X->ops->axpy != Y->ops->axpy) {
581     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
582     PetscFunctionReturn(0);
583   }
584   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) {
585     PetscBool e;
586     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
587     if (e) {
588       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
589       if (e) str = SAME_NONZERO_PATTERN;
590     }
591   }
592   if (str != SAME_NONZERO_PATTERN) {
593     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
594     PetscFunctionReturn(0);
595   } else {
596     if (Y->offloadmask == PETSC_OFFLOAD_CPU && X->offloadmask == PETSC_OFFLOAD_CPU) {
597       ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
598       PetscFunctionReturn(0);
599     } else {
600       ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
601       ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
602     }
603     Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
604     Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
605     if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) {
606       KokkosBlas::axpy(a,aijkokX->a_d,aijkokY->a_d);
607       Y->offloadmask = PETSC_OFFLOAD_GPU;
608       ierr = WaitForKokkos();CHKERRQ(ierr);
609       ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr);
610       // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
611       ierr = MatSeqAIJKokkosSyncHost(Y);CHKERRQ(ierr);
612     } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???");
613   }
614   PetscFunctionReturn(0);
615 }
616 
617 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
618 {
619   PetscFunctionBegin;
620   A->boundtocpu = PETSC_FALSE;
621 
622   A->ops->setvalues                 = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */
623   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
624   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
625   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
626   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
627   A->ops->scale                     = MatScale_SeqAIJKokkos;
628   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
629   //A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
630   A->ops->mult                      = MatMult_SeqAIJKokkos;
631   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
632   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
633   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
634   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
635   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
636   PetscFunctionReturn(0);
637 }
638 
639 /* --------------------------------------------------------------------------------*/
640 /*C
641    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
642    (the default parallel PETSc format). This matrix will ultimately be handled by
643    Kokkos for calculations. For good matrix
644    assembly performance the user should preallocate the matrix storage by setting
645    the parameter nz (or the array nnz).  By setting these parameters accurately,
646    performance during matrix assembly can be increased by more than a factor of 50.
647 
648    Collective
649 
650    Input Parameters:
651 +  comm - MPI communicator, set to PETSC_COMM_SELF
652 .  m - number of rows
653 .  n - number of columns
654 .  nz - number of nonzeros per row (same for all rows)
655 -  nnz - array containing the number of nonzeros in the various rows
656          (possibly different for each row) or NULL
657 
658    Output Parameter:
659 .  A - the matrix
660 
661    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
662    MatXXXXSetPreallocation() paradgm instead of this routine directly.
663    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
664 
665    Notes:
666    If nnz is given then nz is ignored
667 
668    The AIJ format (also called the Yale sparse matrix format or
669    compressed row storage), is fully compatible with standard Fortran 77
670    storage.  That is, the stored row and column indices can begin at
671    either one (as in Fortran) or zero.  See the users' manual for details.
672 
673    Specify the preallocated storage with either nz or nnz (not both).
674    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
675    allocation.  For large problems you MUST preallocate memory or you
676    will get TERRIBLE performance, see the users' manual chapter on matrices.
677 
678    By default, this format uses inodes (identical nodes) when possible, to
679    improve numerical efficiency of matrix-vector products and solves. We
680    search for consecutive rows with the same nonzero structure, thereby
681    reusing matrix information to achieve increased efficiency.
682 
683    Level: intermediate
684 
685 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
686 @*/
687 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
688 {
689   PetscErrorCode ierr;
690 
691   PetscFunctionBegin;
692   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
693   ierr = MatCreate(comm,A);CHKERRQ(ierr);
694   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
695   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
696   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
697   PetscFunctionReturn(0);
698 }
699 
700 // factorizations
701 
702 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOS(Mat B,Mat A,const MatFactorInfo *info)
703 {
704   // Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
705   // IS             isrow = b->row,iscol = b->col;
706   // PetscBool      row_identity,col_identity;
707   PetscErrorCode ierr;
708 
709   PetscFunctionBegin;
710   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
711   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
712   B->offloadmask = PETSC_OFFLOAD_CPU;
713   // solves stay on CPU now
714   /* determine which version of MatSolve needs to be used. */
715   // ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
716   // ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
717   // if (row_identity && col_identity) {
718   //   B->ops->solve = MatSolve_SeqAIJKOKKOS_NaturalOrdering;
719   //   B->ops->solvetranspose = MatSolveTranspose_SeqAIJKOKKOS_NaturalOrdering;
720   //   B->ops->matsolve = NULL;
721   //   B->ops->matsolvetranspose = NULL;
722   // } else {
723   //   B->ops->solve = MatSolve_SeqAIJKOKKOS;
724   //   B->ops->solvetranspose = MatSolveTranspose_SeqAIJKOKKOS;
725   //   B->ops->matsolve = NULL;
726   //   B->ops->matsolvetranspose = NULL;
727   // }
728 
729   /* get the triangular factors */
730   // ierr = MatSeqAIJKOKKOSILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
731   PetscFunctionReturn(0);
732 }
733 
734 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOS(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
735 {
736   PetscErrorCode               ierr;
737 
738   PetscFunctionBegin;
739   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
740   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOS;
741   PetscFunctionReturn(0);
742 }
743 
744 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat,MatFactorType,Mat*);
745 
746 
747 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
748 {
749   PetscErrorCode ierr;
750 
751   PetscFunctionBegin;
752   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos);CHKERRQ(ierr);
753   PetscFunctionReturn(0);
754 }
755 
756 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos(Mat A,MatSolverType *type)
757 {
758   PetscFunctionBegin;
759   *type = MATSOLVERKOKKOS;
760   PetscFunctionReturn(0);
761 }
762 
763 /*MC
764   MATSOLVERKOKKOS = "kokkos" - A matrix solver type providing triangular solvers for sequential matrices
765   on a single GPU of type, seqaijkokkos, aijkokkos.
766 
767   Level: beginner
768 
769 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKOKKOS(), MATAIJKOKKOS, MatCreateAIJKOKKOS(), MatKOKKOSSetFormat(), MatKOKKOSStorageFormat, MatKOKKOSFormatOperation
770 M*/
771 
772 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat A,MatFactorType ftype,Mat *B)
773 {
774   PetscErrorCode ierr;
775   PetscInt       n = A->rmap->n;
776 
777   PetscFunctionBegin;
778   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
779   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
780   (*B)->factortype = ftype;
781   (*B)->useordering = PETSC_TRUE;
782   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
783 
784   if (ftype == MAT_FACTOR_LU /* || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT*/) {
785     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
786     // (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKOKKOS;
787     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOS;
788     // } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
789     // (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKOKKOS;
790     // (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKOKKOS;
791   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
792 
793   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
794   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos);CHKERRQ(ierr);
795   PetscFunctionReturn(0);
796 }
797