xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 80fdaca06e24329b048de2e274d60da818b72e9c)
1 #include "petsc/private/petscimpl.h"
2 #include <petscsystypes.h>
3 #include <petscerror.h>
4 #include <petscveckokkos.hpp>
5 
6 #include <Kokkos_Core.hpp>
7 #include <KokkosSparse_CrsMatrix.hpp>
8 #include <KokkosSparse_spmv.hpp>
9 #include <KokkosSparse_spmv.hpp>
10 #include <../src/mat/impls/aij/seq/aij.h>
11 
12 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
13 #include <petscmat.h>
14 
15 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
16 
17 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
18 {
19   PetscErrorCode    ierr;
20   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
21 
22   PetscFunctionBegin;
23   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
24   if (aijkok && aijkok->device_mat_d.data()) {
25     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
26   }
27   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
28   PetscFunctionReturn(0);
29 }
30 
31 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
32 {
33   Mat_SeqAIJ                *aijseq = (Mat_SeqAIJ*)A->data;
34   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
35   PetscInt                  nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
36 
37   PetscFunctionBegin;
38   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
39   /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */
40   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
41     delete aijkok;
42     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
43     aijkok->nonzerostate = A->nonzerostate;
44     A->spptr             = aijkok;
45   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
46     Kokkos::deep_copy(aijkok->a_d,aijkok->a_h);
47   }
48   A->offloadmask = PETSC_OFFLOAD_BOTH;
49   PetscFunctionReturn(0);
50 }
51 
52 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
53 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat)
54 {
55   Mat_SeqAIJKokkos *aijkok;
56   Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat);
57 
58   PetscFunctionBegin;
59   // ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
60   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
61   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos");
62   aijkok->device_mat_d = create_mirror(DeviceMemorySpace(),h_mat_k);
63   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
64   PetscFunctionReturn(0);
65 }
66 
67 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
68 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat)
69 {
70   Mat_SeqAIJKokkos *aijkok;
71 
72   PetscFunctionBegin;
73   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
74   if (aijkok && aijkok->device_mat_d.data()) {
75     *d_mat = aijkok->device_mat_d.data();
76   } else {
77     PetscErrorCode   ierr;
78     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
79     *d_mat  = NULL;
80   }
81   PetscFunctionReturn(0);
82 }
83 
84 /* y = A x */
85 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
86 {
87   PetscErrorCode                   ierr;
88   Mat_SeqAIJKokkos                 *aijkok;
89   ConstPetscScalarViewDevice_t     xv;
90   PetscScalarViewDevice_t          yv;
91 
92   PetscFunctionBegin;
93   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
94   ierr   = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
95   ierr   = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
96   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
97   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
98   ierr   = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
99   ierr   = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
100   /* 2.0*aijkok->csr.nnz()-aijkok->csr.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */
101   ierr   = WaitForKokkos();CHKERRQ(ierr);
102   ierr   = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
103   PetscFunctionReturn(0);
104 }
105 
106 /* y = A^T x */
107 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
108 {
109   PetscErrorCode                   ierr;
110   Mat_SeqAIJKokkos                 *aijkok;
111   ConstPetscScalarViewDevice_t     xv;
112   PetscScalarViewDevice_t          yv;
113 
114   PetscFunctionBegin;
115   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
116   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
117   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
118   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
119   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
120   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
121   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
122   ierr = WaitForKokkos();CHKERRQ(ierr);
123   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
124   PetscFunctionReturn(0);
125 }
126 
127 /* y = A^H x */
128 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
129 {
130   PetscErrorCode                   ierr;
131   Mat_SeqAIJKokkos                 *aijkok;
132   ConstPetscScalarViewDevice_t     xv;
133   PetscScalarViewDevice_t          yv;
134 
135   PetscFunctionBegin;
136   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
137   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
138   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
139   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
140   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
141   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
142   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
143   ierr = WaitForKokkos();CHKERRQ(ierr);
144   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
145   PetscFunctionReturn(0);
146 }
147 
148 /* z = A x + y */
149 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
150 {
151   PetscErrorCode                   ierr;
152   Mat_SeqAIJKokkos                 *aijkok;
153   ConstPetscScalarViewDevice_t     xv,yv;
154   PetscScalarViewDevice_t          zv;
155 
156   PetscFunctionBegin;
157   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
158   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
159   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
160   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
161   if (zz != yy) Kokkos::deep_copy(zv,yv);
162   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
163   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
164   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
165   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
166   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
167   ierr = WaitForKokkos();CHKERRQ(ierr);
168   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
169   PetscFunctionReturn(0);
170 }
171 
172 /* z = A^T x + y */
173 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
174 {
175   PetscErrorCode                   ierr;
176   Mat_SeqAIJKokkos                 *aijkok;
177   ConstPetscScalarViewDevice_t     xv,yv;
178   PetscScalarViewDevice_t          zv;
179 
180   PetscFunctionBegin;
181   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
182   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
183   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
184   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
185   if (zz != yy) Kokkos::deep_copy(zv,yv);
186   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
187   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
188   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
189   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
190   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
191   ierr = WaitForKokkos();CHKERRQ(ierr);
192   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
193   PetscFunctionReturn(0);
194 }
195 
196 /* z = A^H x + y */
197 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
198 {
199   PetscErrorCode                   ierr;
200   Mat_SeqAIJKokkos                 *aijkok;
201   ConstPetscScalarViewDevice_t     xv,yv;
202   PetscScalarViewDevice_t          zv;
203 
204   PetscFunctionBegin;
205   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
206   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
207   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
208   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
209   if (zz != yy) Kokkos::deep_copy(zv,yv);
210   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
211   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
212   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
213   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
214   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
215   ierr = WaitForKokkos();CHKERRQ(ierr);
216   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
217   PetscFunctionReturn(0);
218 }
219 
220 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
221 {
222   PetscErrorCode ierr;
223   Mat            B;
224   Mat_SeqAIJ     *aij;
225 
226   PetscFunctionBegin;
227   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
228   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
229     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
230   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
231     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
232   }
233 
234   B    = *newmat;
235   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
236   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
237   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
238   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
239   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
240   aij  = (Mat_SeqAIJ*)B->data;
241   aij->inode.use = PETSC_FALSE;
242 
243   B->offloadmask = PETSC_OFFLOAD_CPU;
244   PetscFunctionReturn(0);
245 }
246 
247 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
248 {
249   PetscErrorCode ierr;
250 
251   PetscFunctionBegin;
252   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
253   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
254   PetscFunctionReturn(0);
255 }
256 
257 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
258 {
259   PetscErrorCode        ierr;
260   Mat_SeqAIJKokkos      *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
261 
262   PetscFunctionBegin;
263   if (aijkok && aijkok->device_mat_d.data()) {
264     delete aijkok->colmap_d;
265     delete aijkok->i_uncompressed_d;
266   }
267   delete aijkok;
268   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
269   PetscFunctionReturn(0);
270 }
271 
272 #if 0
273 static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data)
274 {
275   MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data);
276 
277   PetscFunctionBegin;
278   delete kh;
279   PetscFunctionReturn(0);
280 }
281 
282 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
283 {
284   Mat_Product          *product = C->product;
285   Mat                  A,B;
286   MatProductType       ptype;
287   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
288   bool                 tA,tB;
289   PetscErrorCode       ierr;
290   MatMatKernelHandle_t *kh;
291   Mat_SeqAIJ           *c;
292 
293   PetscFunctionBegin;
294   MatCheckProduct(C,1);
295   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
296   A = product->A;
297   B = product->B;
298   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
299   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
300   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
301   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
302   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
303   kh   = static_cast<MatMatKernelHandle_t*>(C->product->data);
304   ptype = product->type;
305   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
306   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
307   switch (ptype) {
308   case MATPRODUCT_AB:
309     tA = false;
310     tB = false;
311     break;
312   case MATPRODUCT_AtB:
313     tA = true;
314     tB = false;
315     break;
316   case MATPRODUCT_ABt:
317     tA = false;
318     tB = true;
319     break;
320   default:
321     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
322   }
323 
324   KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr);
325   C->offloadmask = PETSC_OFFLOAD_GPU;
326   /* shorter version of MatAssemblyEnd_SeqAIJ */
327   c = (Mat_SeqAIJ*)C->data;
328   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
329   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
330   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
331   c->reallocs         = 0;
332   C->info.mallocs    += 0;
333   C->info.nz_unneeded = 0;
334   C->assembled = C->was_assembled = PETSC_TRUE;
335   C->num_ass++;
336   /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */
337   // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray()
338   C->offloadmask = PETSC_OFFLOAD_BOTH;
339   // Also, we should add support to copy back from device to host
340   PetscFunctionReturn(0);
341 }
342 
343 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
344 {
345   Mat_Product          *product = C->product;
346   Mat                  A,B;
347   MatProductType       ptype;
348   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
349   PetscInt             m,n,k;
350   bool                 tA,tB;
351   PetscErrorCode       ierr;
352   Mat_SeqAIJ           *c;
353   MatMatKernelHandle_t *kh;
354 
355   PetscFunctionBegin;
356   MatCheckProduct(C,1);
357   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
358   A = product->A;
359   B = product->B;
360   // TODO only copy the i,j data, not the values
361   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
362   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
363   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
364   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
365   ptype = product->type;
366   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
367   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
368   switch (ptype) {
369   case MATPRODUCT_AB:
370     tA = false;
371     tB = false;
372     m = A->rmap->n;
373     n = B->cmap->n;
374     break;
375   case MATPRODUCT_AtB:
376     tA = true;
377     tB = false;
378     m = A->cmap->n;
379     n = B->cmap->n;
380     break;
381   case MATPRODUCT_ABt:
382     tA = false;
383     tB = true;
384     m = A->rmap->n;
385     n = B->rmap->n;
386     break;
387   default:
388     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
389   }
390   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
391   ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr);
392   c = (Mat_SeqAIJ*)C->data;
393 
394   kh = new MatMatKernelHandle_t;
395   // TODO SZ: ADD RUNTIME SELECTION OF THESE
396   kh->set_team_work_size(16);
397   kh->set_dynamic_scheduling(true);
398   // Select an spgemm algorithm, limited by configuration at compile-time and
399   // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED,
400   // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL}
401   std::string myalg("SPGEMM_KK_MEMORY");
402   kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg));
403 
404   /////////////////////////////////////
405   // TODO JZ
406   ckok = NULL; //new Mat_SeqAIJKokkos();
407   C->spptr = ckok;
408   KokkosCsrMatrix_t ccsr; // here only to have the code compile
409   KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr);
410   //cerr = WaitForKOKKOS();CHKERRCUDA(cerr);
411   //c->nz = get_nnz_from_ccsr
412   //////////////////////////////////////
413   c->singlemalloc = PETSC_FALSE;
414   c->free_a       = PETSC_TRUE;
415   c->free_ij      = PETSC_TRUE;
416   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
417   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
418   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
419   ////////////////////////////////////
420   // TODO JZ copy from device to c->i and c->j
421   ////////////////////////////////////
422   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
423   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
424   c->maxnz = c->nz;
425   c->nonzerorowcnt = 0;
426   c->rmax = 0;
427   for (k = 0; k < m; k++) {
428     const PetscInt nn = c->i[k+1] - c->i[k];
429     c->ilen[k] = c->imax[k] = nn;
430     c->nonzerorowcnt += (PetscInt)!!nn;
431     c->rmax = PetscMax(c->rmax,nn);
432   }
433 
434   C->nonzerostate++;
435   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
436   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
437   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
438   ckok->nonzerostate = C->nonzerostate;
439   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
440   C->preallocated  = PETSC_TRUE;
441   C->assembled     = PETSC_FALSE;
442   C->was_assembled = PETSC_FALSE;
443 
444   C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
445   C->product->data = kh;
446   C->product->destroy = MatMatKernelHandleDestroy_Private;
447   PetscFunctionReturn(0);
448 }
449 
450 /* handles sparse matrix matrix ops */
451 PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
452 {
453   Mat_Product    *product = mat->product;
454   PetscErrorCode ierr;
455   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
456 
457   PetscFunctionBegin;
458   MatCheckProduct(mat,1);
459   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
460   if (product->type == MATPRODUCT_ABC) {
461     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
462   }
463   if (Biskok && Ciskok) {
464     switch (product->type) {
465     case MATPRODUCT_AB:
466     case MATPRODUCT_AtB:
467     case MATPRODUCT_ABt:
468       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
469       break;
470     case MATPRODUCT_PtAP:
471     case MATPRODUCT_RARt:
472     case MATPRODUCT_ABC:
473       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
474       break;
475     default:
476       break;
477     }
478   } else { /* fallback for AIJ */
479     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
480   }
481   PetscFunctionReturn(0);
482 }
483 #endif
484 
485 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
486 {
487   PetscErrorCode ierr;
488 
489   PetscFunctionBegin;
490   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
491   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
492   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
493   PetscFunctionReturn(0);
494 }
495 
496 static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
497 {
498   PetscErrorCode    ierr;
499   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
500   PetscFunctionBegin;
501   if (aijkok && aijkok->device_mat_d.data()) {
502     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported");
503   }
504   ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr);
505   PetscFunctionReturn(0);
506 }
507 
508 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
509 {
510   PetscErrorCode             ierr;
511   PetscBool                  both = PETSC_FALSE;
512   Mat_SeqAIJKokkos           *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
513   Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
514 
515   PetscFunctionBegin;
516   if (aijkok && aijkok->a_d.data()) {
517     Kokkos::parallel_for (aijkok->a_d.size(), KOKKOS_LAMBDA (int i) { aijkok->a_d(i) = 0; });
518     both = PETSC_TRUE;
519   }
520   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
521   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
522   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
523   else A->offloadmask = PETSC_OFFLOAD_CPU;
524 
525   PetscFunctionReturn(0);
526 }
527 
528 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str) // put axpy in aijcusparse, etc.
529 {
530   PetscErrorCode ierr;
531   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
532   PetscBool      flgx,flgy;
533 
534   PetscFunctionBegin;
535   if (a == (PetscScalar)0.0) PetscFunctionReturn(0);
536   PetscValidHeaderSpecific(Y,MAT_CLASSID,1);
537   PetscValidHeaderSpecific(X,MAT_CLASSID,3);
538   ierr = PetscObjectTypeCompare((PetscObject)Y,MATSEQAIJKOKKOS,&flgy);CHKERRQ(ierr);
539   ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQAIJKOKKOS,&flgx);CHKERRQ(ierr);
540   if (!flgx || !flgy) {
541     ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
542     PetscFunctionReturn(0);
543   }
544   if (str == DIFFERENT_NONZERO_PATTERN) {
545     if (x->nz == y->nz) {
546       PetscBool e;
547       ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
548       if (e) {
549         ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
550         if (e) {
551           str = SAME_NONZERO_PATTERN;
552         }
553       }
554     }
555   }
556   if (str != SAME_NONZERO_PATTERN) {
557     ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
558     PetscFunctionReturn(0);
559   } else {
560     if (Y->offloadmask==PETSC_OFFLOAD_CPU && X->offloadmask==PETSC_OFFLOAD_CPU) {
561       ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
562       PetscFunctionReturn(0);
563     } else if ((Y->offloadmask==PETSC_OFFLOAD_GPU || Y->offloadmask==PETSC_OFFLOAD_BOTH) && X->offloadmask == PETSC_OFFLOAD_CPU) {
564       ierr    = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
565     } else if ((X->offloadmask==PETSC_OFFLOAD_GPU || X->offloadmask==PETSC_OFFLOAD_BOTH) && Y->offloadmask == PETSC_OFFLOAD_CPU) {
566       ierr    = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); /* promote Y */
567     }
568     {
569       Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
570       Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
571       if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) {
572         Kokkos::parallel_for (aijkokY->a_d.size(), KOKKOS_LAMBDA (int i) { aijkokY->a_d(i) += a*aijkokX->a_d(i); });
573         Kokkos::deep_copy(aijkokY->a_h,aijkokY->a_d); // we could not copy and keep GPU
574         Y->offloadmask = PETSC_OFFLOAD_BOTH;
575         ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr);
576       } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???");
577     }
578   }
579   PetscFunctionReturn(0);
580 }
581 
582 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
583 {
584   PetscFunctionBegin;
585   A->ops->setvalues                 = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */
586   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
587   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
588   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
589   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
590   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
591   //A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
592   A->ops->mult                      = MatMult_SeqAIJKokkos;
593   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
594   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
595   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
596   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
597   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
598   PetscFunctionReturn(0);
599 }
600 
601 /* --------------------------------------------------------------------------------*/
602 /*C
603    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
604    (the default parallel PETSc format). This matrix will ultimately be handled by
605    Kokkos for calculations. For good matrix
606    assembly performance the user should preallocate the matrix storage by setting
607    the parameter nz (or the array nnz).  By setting these parameters accurately,
608    performance during matrix assembly can be increased by more than a factor of 50.
609 
610    Collective
611 
612    Input Parameters:
613 +  comm - MPI communicator, set to PETSC_COMM_SELF
614 .  m - number of rows
615 .  n - number of columns
616 .  nz - number of nonzeros per row (same for all rows)
617 -  nnz - array containing the number of nonzeros in the various rows
618          (possibly different for each row) or NULL
619 
620    Output Parameter:
621 .  A - the matrix
622 
623    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
624    MatXXXXSetPreallocation() paradgm instead of this routine directly.
625    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
626 
627    Notes:
628    If nnz is given then nz is ignored
629 
630    The AIJ format (also called the Yale sparse matrix format or
631    compressed row storage), is fully compatible with standard Fortran 77
632    storage.  That is, the stored row and column indices can begin at
633    either one (as in Fortran) or zero.  See the users' manual for details.
634 
635    Specify the preallocated storage with either nz or nnz (not both).
636    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
637    allocation.  For large problems you MUST preallocate memory or you
638    will get TERRIBLE performance, see the users' manual chapter on matrices.
639 
640    By default, this format uses inodes (identical nodes) when possible, to
641    improve numerical efficiency of matrix-vector products and solves. We
642    search for consecutive rows with the same nonzero structure, thereby
643    reusing matrix information to achieve increased efficiency.
644 
645    Level: intermediate
646 
647 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
648 @*/
649 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
650 {
651   PetscErrorCode ierr;
652 
653   PetscFunctionBegin;
654   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
655   ierr = MatCreate(comm,A);CHKERRQ(ierr);
656   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
657   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
658   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
659   PetscFunctionReturn(0);
660 }
661