xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision cbd2c53bc8d4e32f1aab714f173fd0b4c821fa72)
1 #include <petscvec_kokkos.hpp>
2 #include <petsc/private/petscimpl.h>
3 #include <petscsystypes.h>
4 #include <petscerror.h>
5 
6 #include <Kokkos_Core.hpp>
7 #include <KokkosBlas.hpp>
8 #include <KokkosSparse_CrsMatrix.hpp>
9 #include <KokkosSparse_spmv.hpp>
10 #include <KokkosSparse_spiluk.hpp>
11 #include <KokkosSparse_sptrsv.hpp>
12 
13 #include <../src/mat/impls/aij/seq/aij.h>
14 
15 #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
16 #include <petscmat.h>
17 
18 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
19 
20 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
21 {
22   PetscErrorCode    ierr;
23   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
24 
25   PetscFunctionBegin;
26   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
27   A->offloadmask = PETSC_OFFLOAD_CPU;
28   if (aijkok && aijkok->device_mat_d.data()) {
29     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
30   }
31   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
32   PetscFunctionReturn(0);
33 }
34 
35 /* Sync CSR data to device if not yet */
36 static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
37 {
38   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ*>(A->data);
39   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
40   PetscInt                  nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
41 
42   PetscFunctionBegin;
43   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
44   /* If aijkok is not built yet OR the nonzero pattern on CPU has changed, build aijkok from the scratch */
45   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
46     delete aijkok;
47     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
48     aijkok->nonzerostate = A->nonzerostate;
49     A->spptr             = aijkok;
50   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
51     aijkok->a_dual.clear_sync_state();
52     aijkok->a_dual.modify_host(); /* Mark host is modified */
53     aijkok->a_dual.sync_device(); /* Sync the device */
54     aijkok->transpose_updated = PETSC_FALSE;
55     aijkok->hermitian_updated = PETSC_FALSE;
56   }
57   A->offloadmask = PETSC_OFFLOAD_BOTH;
58   PetscFunctionReturn(0);
59 }
60 
61 /* Mark the CSR data on device is modified */
62 static PetscErrorCode MatSeqAIJKokkosSetDeviceModified(Mat A)
63 {
64   PetscErrorCode   ierr;
65   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
66 
67   PetscFunctionBegin;
68   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
69   aijkok->a_dual.clear_sync_state();
70   aijkok->a_dual.modify_device();
71   aijkok->transpose_updated = PETSC_FALSE;
72   aijkok->hermitian_updated = PETSC_FALSE;
73   A->offloadmask = PETSC_OFFLOAD_GPU;
74   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
75   PetscFunctionReturn(0);
76 }
77 
78 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
79 {
80   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
81 
82   PetscFunctionBegin;
83   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
84    /* We do not expect one needs factors on host  */
85   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
86   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
87     if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
88     aijkok->a_dual.clear_sync_state();
89     aijkok->a_dual.modify_device(); /* Mark device is modified */
90     aijkok->a_dual.sync_host(); /* Sync the host */
91     A->offloadmask = PETSC_OFFLOAD_BOTH;
92   }
93   PetscFunctionReturn(0);
94 }
95 
96 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
97 {
98   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
99   PetscErrorCode ierr;
100 
101   PetscFunctionBegin;
102   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
103   *array = a->a;
104   A->offloadmask = PETSC_OFFLOAD_CPU;
105   PetscFunctionReturn(0);
106 }
107 
108 // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
109 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat)
110 {
111   Mat_SeqAIJKokkos *aijkok;
112   Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat);
113 
114   PetscFunctionBegin;
115   // ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
116   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
117   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos");
118   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
119   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
120   PetscFunctionReturn(0);
121 }
122 
123 // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
124 PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat)
125 {
126   Mat_SeqAIJKokkos *aijkok;
127 
128   PetscFunctionBegin;
129   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
130   if (aijkok && aijkok->device_mat_d.data()) {
131     *d_mat = aijkok->device_mat_d.data();
132   } else {
133     PetscErrorCode   ierr;
134     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
135     *d_mat  = NULL;
136   }
137   PetscFunctionReturn(0);
138 }
139 static PetscErrorCode MatSeqAIJKokkosGenerateTranspose(Mat A)
140 {
141   PetscErrorCode                   ierr;
142   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
143 
144   PetscFunctionBegin;
145   if (!aijkok->At) { /* Generate At for the first time */
146     ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->At);CHKERRQ(ierr);
147     ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr);
148   } else if (!aijkok->transpose_updated) { /* Only update At values */
149     ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->At);CHKERRQ(ierr);
150     ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr);
151   }
152   aijkok->transpose_updated = PETSC_TRUE;
153   PetscFunctionReturn(0);
154 }
155 
156 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian(Mat A)
157 {
158   PetscErrorCode                   ierr;
159   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
160 
161   PetscFunctionBegin;
162   if (!aijkok->Ah) { /* Generate Ah for the first time */
163     ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->Ah);CHKERRQ(ierr);
164     ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr);
165     ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr);
166   } else if (!aijkok->hermitian_updated) { /* Only update Ah values */
167     ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->Ah);CHKERRQ(ierr);
168     ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr);
169     ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr);
170   }
171   aijkok->hermitian_updated = PETSC_TRUE;
172   PetscFunctionReturn(0);
173 }
174 
175 /* y = A x */
176 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
177 {
178   PetscErrorCode                   ierr;
179   Mat_SeqAIJKokkos                 *aijkok;
180   ConstPetscScalarKokkosView       xv;
181   PetscScalarKokkosView            yv;
182 
183   PetscFunctionBegin;
184   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
185   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
186   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
187   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
188   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
189   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
190   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
191   ierr   = WaitForKokkos();CHKERRQ(ierr);
192   /* 2.0*aijkok->csrmat.nnz()-aijkok->csrmat.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */
193   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
194   PetscFunctionReturn(0);
195 }
196 
197 /* y = A^T x */
198 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
199 {
200   PetscErrorCode                   ierr;
201   Mat_SeqAIJKokkos                 *aijkok;
202   Mat                              B;
203   const char                       *mode;
204   ConstPetscScalarKokkosView       xv;
205   PetscScalarKokkosView            yv;
206 
207   PetscFunctionBegin;
208   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
209   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
210   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
211   if (A->form_explicit_transpose) {
212     ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr);
213     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At;
214     mode = "N";
215   } else {
216     B    = A;
217     mode = "T";
218   }
219   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
220   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
221   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
222   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
223   ierr = WaitForKokkos();CHKERRQ(ierr);
224   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
225   PetscFunctionReturn(0);
226 }
227 
228 /* y = A^H x */
229 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
230 {
231   PetscErrorCode                   ierr;
232   Mat_SeqAIJKokkos                 *aijkok;
233   Mat                              B;
234   const char                       *mode;
235   ConstPetscScalarKokkosView       xv;
236   PetscScalarKokkosView            yv;
237 
238   PetscFunctionBegin;
239   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
240   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
241   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
242   if (A->form_explicit_transpose) {
243     ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr);
244     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah;
245     mode = "N";
246   } else {
247     B    = A;
248     mode = "C";
249   }
250   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
251   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
252   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
253   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
254   ierr = WaitForKokkos();CHKERRQ(ierr);
255   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
256   PetscFunctionReturn(0);
257 }
258 
259 /* z = A x + y */
260 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
261 {
262   PetscErrorCode                   ierr;
263   Mat_SeqAIJKokkos                 *aijkok;
264   ConstPetscScalarKokkosView       xv,yv;
265   PetscScalarKokkosView            zv;
266 
267   PetscFunctionBegin;
268   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
269   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
270   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
271   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
272   if (zz != yy) Kokkos::deep_copy(zv,yv);
273   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
274   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
275   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
276   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
277   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
278   ierr = WaitForKokkos();CHKERRQ(ierr);
279   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
280   PetscFunctionReturn(0);
281 }
282 
283 /* z = A^T x + y */
284 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
285 {
286   PetscErrorCode                   ierr;
287   Mat_SeqAIJKokkos                 *aijkok;
288   Mat                              B;
289   const char                       *mode;
290   ConstPetscScalarKokkosView       xv,yv;
291   PetscScalarKokkosView            zv;
292 
293   PetscFunctionBegin;
294   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
295   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
296   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
297   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
298   if (zz != yy) Kokkos::deep_copy(zv,yv);
299   if (A->form_explicit_transpose) {
300     ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr);
301     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At;
302     mode = "N";
303   } else {
304     B    = A;
305     mode = "T";
306   }
307   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
308   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
309   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
310   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
311   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
312   ierr = WaitForKokkos();CHKERRQ(ierr);
313   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
314   PetscFunctionReturn(0);
315 }
316 
317 /* z = A^H x + y */
318 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
319 {
320   PetscErrorCode                   ierr;
321   Mat_SeqAIJKokkos                 *aijkok;
322   Mat                              B;
323   const char                       *mode;
324   ConstPetscScalarKokkosView       xv,yv;
325   PetscScalarKokkosView            zv;
326 
327   PetscFunctionBegin;
328   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
329   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
330   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
331   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
332   if (zz != yy) Kokkos::deep_copy(zv,yv);
333   if (A->form_explicit_transpose) {
334     ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr);
335     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah;
336     mode = "N";
337   } else {
338     B    = A;
339     mode = "C";
340   }
341   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
342   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
343   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
344   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
345   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
346   ierr = WaitForKokkos();CHKERRQ(ierr);
347   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
348   PetscFunctionReturn(0);
349 }
350 
351 PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
352 {
353   PetscErrorCode            ierr;
354   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
355 
356   PetscFunctionBegin;
357   switch (op) {
358     case MAT_FORM_EXPLICIT_TRANSPOSE:
359       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
360       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
361       A->form_explicit_transpose = flg;
362       break;
363     default:
364       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
365       break;
366   }
367   PetscFunctionReturn(0);
368 }
369 
370 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
371 {
372   PetscErrorCode ierr;
373   Mat            B;
374   Mat_SeqAIJ     *aij;
375 
376   PetscFunctionBegin;
377   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
378   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
379     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
380   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
381     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
382   }
383 
384   B    = *newmat;
385   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
386   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
387 
388   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
389   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
390   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr);
391   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
392   aij  = (Mat_SeqAIJ*)B->data;
393   aij->inode.use = PETSC_FALSE;
394 
395   PetscFunctionReturn(0);
396 }
397 
398 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
399 {
400   PetscErrorCode ierr;
401 
402   PetscFunctionBegin;
403   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
404   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
405   PetscFunctionReturn(0);
406 }
407 
408 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
409 {
410   PetscErrorCode             ierr;
411   Mat_SeqAIJKokkos           *aijkok;
412 
413   PetscFunctionBegin;
414   if (A->factortype == MAT_FACTOR_NONE) {
415     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
416     if (aijkok) {
417       if (aijkok->device_mat_d.data()) {
418         delete aijkok->colmap_d;
419         delete aijkok->i_uncompressed_d;
420       }
421       if (aijkok->diag_d) delete aijkok->diag_d;
422     }
423     delete aijkok;
424   } else {
425     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
426   }
427   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr);
428   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
429   A->spptr = NULL;
430   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
431   PetscFunctionReturn(0);
432 }
433 
434 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
435 {
436   PetscErrorCode ierr;
437 
438   PetscFunctionBegin;
439   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
440   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
441   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
442   PetscFunctionReturn(0);
443 }
444 
445 #if 0
446 static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data)
447 {
448   MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data);
449 
450   PetscFunctionBegin;
451   delete kh;
452   PetscFunctionReturn(0);
453 }
454 
455 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
456 {
457   Mat_Product          *product = C->product;
458   Mat                  A,B;
459   MatProductType       ptype;
460   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
461   bool                 tA,tB;
462   PetscErrorCode       ierr;
463   MatMatKernelHandle_t *kh;
464   Mat_SeqAIJ           *c;
465 
466   PetscFunctionBegin;
467   MatCheckProduct(C,1);
468   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
469   A = product->A;
470   B = product->B;
471   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
472   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
473   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
474   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
475   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
476   kh   = static_cast<MatMatKernelHandle_t*>(C->product->data);
477   ptype = product->type;
478   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
479   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
480   switch (ptype) {
481   case MATPRODUCT_AB:
482     tA = false;
483     tB = false;
484     break;
485   case MATPRODUCT_AtB:
486     tA = true;
487     tB = false;
488     break;
489   case MATPRODUCT_ABt:
490     tA = false;
491     tB = true;
492     break;
493   default:
494     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
495   }
496 
497   KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr);
498   C->offloadmask = PETSC_OFFLOAD_GPU;
499   /* shorter version of MatAssemblyEnd_SeqAIJ */
500   c = (Mat_SeqAIJ*)C->data;
501   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
502   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
503   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
504   c->reallocs         = 0;
505   C->info.mallocs    += 0;
506   C->info.nz_unneeded = 0;
507   C->assembled = C->was_assembled = PETSC_TRUE;
508   C->num_ass++;
509   /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */
510   // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray()
511   C->offloadmask = PETSC_OFFLOAD_BOTH;
512   // Also, we should add support to copy back from device to host
513   PetscFunctionReturn(0);
514 }
515 
516 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
517 {
518   Mat_Product          *product = C->product;
519   Mat                  A,B;
520   MatProductType       ptype;
521   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
522   PetscInt             m,n,k;
523   bool                 tA,tB;
524   PetscErrorCode       ierr;
525   Mat_SeqAIJ           *c;
526   MatMatKernelHandle_t *kh;
527 
528   PetscFunctionBegin;
529   MatCheckProduct(C,1);
530   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
531   A = product->A;
532   B = product->B;
533   // TODO only copy the i,j data, not the values
534   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
535   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
536   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
537   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
538   ptype = product->type;
539   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
540   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
541   switch (ptype) {
542   case MATPRODUCT_AB:
543     tA = false;
544     tB = false;
545     m = A->rmap->n;
546     n = B->cmap->n;
547     break;
548   case MATPRODUCT_AtB:
549     tA = true;
550     tB = false;
551     m = A->cmap->n;
552     n = B->cmap->n;
553     break;
554   case MATPRODUCT_ABt:
555     tA = false;
556     tB = true;
557     m = A->rmap->n;
558     n = B->rmap->n;
559     break;
560   default:
561     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
562   }
563   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
564   ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr);
565   c = (Mat_SeqAIJ*)C->data;
566 
567   kh = new MatMatKernelHandle_t;
568   // TODO SZ: ADD RUNTIME SELECTION OF THESE
569   kh->set_team_work_size(16);
570   kh->set_dynamic_scheduling(true);
571   // Select an spgemm algorithm, limited by configuration at compile-time and
572   // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED,
573   // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL}
574   std::string myalg("SPGEMM_KK_MEMORY");
575   kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg));
576 
577   /////////////////////////////////////
578   // TODO JZ
579   ckok = NULL; //new Mat_SeqAIJKokkos();
580   C->spptr = ckok;
581   KokkosCsrMatrix_t ccsr; // here only to have the code compile
582   KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr);
583   //cerr = WaitForKOKKOS();CHKERRCUDA(cerr);
584   //c->nz = get_nnz_from_ccsr
585   //////////////////////////////////////
586   c->singlemalloc = PETSC_FALSE;
587   c->free_a       = PETSC_TRUE;
588   c->free_ij      = PETSC_TRUE;
589   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
590   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
591   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
592   ////////////////////////////////////
593   // TODO JZ copy from device to c->i and c->j
594   ////////////////////////////////////
595   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
596   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
597   c->maxnz = c->nz;
598   c->nonzerorowcnt = 0;
599   c->rmax = 0;
600   for (k = 0; k < m; k++) {
601     const PetscInt nn = c->i[k+1] - c->i[k];
602     c->ilen[k] = c->imax[k] = nn;
603     c->nonzerorowcnt += (PetscInt)!!nn;
604     c->rmax = PetscMax(c->rmax,nn);
605   }
606 
607   C->nonzerostate++;
608   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
609   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
610   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
611   ckok->nonzerostate = C->nonzerostate;
612   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
613   C->preallocated  = PETSC_TRUE;
614   C->assembled     = PETSC_FALSE;
615   C->was_assembled = PETSC_FALSE;
616 
617   C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
618   C->product->data = kh;
619   C->product->destroy = MatMatKernelHandleDestroy_Private;
620   PetscFunctionReturn(0);
621 }
622 
623 /* handles sparse matrix matrix ops */
624 PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
625 {
626   Mat_Product    *product = mat->product;
627   PetscErrorCode ierr;
628   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
629 
630   PetscFunctionBegin;
631   MatCheckProduct(mat,1);
632   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
633   if (product->type == MATPRODUCT_ABC) {
634     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
635   }
636   if (Biskok && Ciskok) {
637     switch (product->type) {
638     case MATPRODUCT_AB:
639     case MATPRODUCT_AtB:
640     case MATPRODUCT_ABt:
641       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
642       break;
643     case MATPRODUCT_PtAP:
644     case MATPRODUCT_RARt:
645     case MATPRODUCT_ABC:
646       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
647       break;
648     default:
649       break;
650     }
651   } else { /* fallback for AIJ */
652     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
653   }
654   PetscFunctionReturn(0);
655 }
656 #endif
657 
658 static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
659 {
660   PetscErrorCode    ierr;
661   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
662   PetscFunctionBegin;
663   if (aijkok && aijkok->device_mat_d.data()) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported");
664   ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr);
665   PetscFunctionReturn(0);
666 }
667 
668 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
669 {
670   PetscErrorCode   ierr;
671   Mat_SeqAIJKokkos *aijkok;
672 
673   PetscFunctionBegin;
674   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
675   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
676   KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d);
677   A->offloadmask = PETSC_OFFLOAD_GPU;
678   ierr = WaitForKokkos();CHKERRQ(ierr);
679   ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr);
680   // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
681   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
682   PetscFunctionReturn(0);
683 }
684 
685 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
686 {
687   PetscErrorCode   ierr;
688   PetscBool        both = PETSC_FALSE;
689   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
690   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
691 
692   PetscFunctionBegin;
693   if (aijkok && aijkok->a_d.data()) {
694     KokkosBlas::fill(aijkok->a_d,0.0);
695     both = PETSC_TRUE;
696   }
697   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
698   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
699   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
700   else A->offloadmask = PETSC_OFFLOAD_CPU;
701   PetscFunctionReturn(0);
702 }
703 
704 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
705 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
706 {
707   PetscErrorCode     ierr;
708   Mat_SeqAIJKokkos   *aijkok;
709 
710   PetscFunctionBegin;
711   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
712   PetscValidPointer(kv,2);
713   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
714   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
715   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
716   *kv    = aijkok->a_d;
717   PetscFunctionReturn(0);
718 }
719 
720 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
721 {
722   PetscFunctionBegin;
723   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
724   PetscValidPointer(kv,2);
725   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
726   PetscFunctionReturn(0);
727 }
728 
729 PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv)
730 {
731   PetscErrorCode     ierr;
732   Mat_SeqAIJKokkos   *aijkok;
733 
734   PetscFunctionBegin;
735   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
736   PetscValidPointer(kv,2);
737   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
738   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
739   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
740   *kv    = aijkok->a_d;
741   PetscFunctionReturn(0);
742 }
743 
744 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv)
745 {
746   PetscErrorCode     ierr;
747 
748   PetscFunctionBegin;
749   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
750   PetscValidPointer(kv,2);
751   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
752   ierr = MatSeqAIJKokkosSetDeviceModified(A);CHKERRQ(ierr);
753   PetscFunctionReturn(0);
754 }
755 
756 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
757 {
758   Mat_SeqAIJKokkos   *aijkok;
759 
760   PetscFunctionBegin;
761   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
762   PetscValidPointer(kv,2);
763   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
764   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
765   *kv    = aijkok->a_d;
766   PetscFunctionReturn(0);
767 }
768 
769 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
770 {
771   PetscErrorCode     ierr;
772 
773   PetscFunctionBegin;
774   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
775   PetscValidPointer(kv,2);
776   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
777   ierr = MatSeqAIJKokkosSetDeviceModified(A);CHKERRQ(ierr);
778   PetscFunctionReturn(0);
779 }
780 
781 /* Computes Y += a*X */
782 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str)
783 {
784   PetscErrorCode ierr;
785   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
786 
787   PetscFunctionBegin;
788   if (X->ops->axpy != Y->ops->axpy) {
789     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
790     PetscFunctionReturn(0);
791   }
792 
793   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) {
794     PetscBool e;
795     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
796     if (e) {
797       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
798       if (e) str = SAME_NONZERO_PATTERN;
799     }
800   }
801 
802   if (str != SAME_NONZERO_PATTERN) {
803     /* TODO: do MatAXPY on device */
804     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
805     PetscFunctionReturn(0);
806   } else {
807     ConstPetscScalarKokkosView xv;
808     PetscScalarKokkosView      yv;
809 
810     ierr = MatSeqAIJGetKokkosView(X,&xv);CHKERRQ(ierr);
811     ierr = MatSeqAIJGetKokkosView(Y,&yv);CHKERRQ(ierr);
812     KokkosBlas::axpy(a,xv,yv);
813     ierr = MatSeqAIJRestoreKokkosView(X,&xv);CHKERRQ(ierr);
814     ierr = MatSeqAIJRestoreKokkosView(Y,&yv);CHKERRQ(ierr);
815   }
816   PetscFunctionReturn(0);
817 }
818 
819 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
820 {
821   PetscErrorCode ierr;
822 
823   PetscFunctionBegin;
824   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
825   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
826   B->offloadmask = PETSC_OFFLOAD_CPU;
827   PetscFunctionReturn(0);
828 }
829 
830 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
831 {
832   PetscFunctionBegin;
833   A->boundtocpu = PETSC_FALSE;
834 
835   A->ops->setvalues                 = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */
836   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
837   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
838   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
839   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
840   A->ops->scale                     = MatScale_SeqAIJKokkos;
841   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
842   //A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
843   A->ops->mult                      = MatMult_SeqAIJKokkos;
844   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
845   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
846   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
847   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
848   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
849   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
850   PetscFunctionReturn(0);
851 }
852 
853 /* --------------------------------------------------------------------------------*/
854 /*@C
855    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
856    (the default parallel PETSc format). This matrix will ultimately be handled by
857    Kokkos for calculations. For good matrix
858    assembly performance the user should preallocate the matrix storage by setting
859    the parameter nz (or the array nnz).  By setting these parameters accurately,
860    performance during matrix assembly can be increased by more than a factor of 50.
861 
862    Collective
863 
864    Input Parameters:
865 +  comm - MPI communicator, set to PETSC_COMM_SELF
866 .  m - number of rows
867 .  n - number of columns
868 .  nz - number of nonzeros per row (same for all rows)
869 -  nnz - array containing the number of nonzeros in the various rows
870          (possibly different for each row) or NULL
871 
872    Output Parameter:
873 .  A - the matrix
874 
875    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
876    MatXXXXSetPreallocation() paradgm instead of this routine directly.
877    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
878 
879    Notes:
880    If nnz is given then nz is ignored
881 
882    The AIJ format (also called the Yale sparse matrix format or
883    compressed row storage), is fully compatible with standard Fortran 77
884    storage.  That is, the stored row and column indices can begin at
885    either one (as in Fortran) or zero.  See the users' manual for details.
886 
887    Specify the preallocated storage with either nz or nnz (not both).
888    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
889    allocation.  For large problems you MUST preallocate memory or you
890    will get TERRIBLE performance, see the users' manual chapter on matrices.
891 
892    By default, this format uses inodes (identical nodes) when possible, to
893    improve numerical efficiency of matrix-vector products and solves. We
894    search for consecutive rows with the same nonzero structure, thereby
895    reusing matrix information to achieve increased efficiency.
896 
897    Level: intermediate
898 
899 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
900 @*/
901 PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
902 {
903   PetscErrorCode ierr;
904 
905   PetscFunctionBegin;
906   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
907   ierr = MatCreate(comm,A);CHKERRQ(ierr);
908   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
909   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
910   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
911   PetscFunctionReturn(0);
912 }
913 
914 // factorizations
915 typedef Kokkos::TeamPolicy<>::member_type team_member;
916 //
917 // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
918 // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
919 //
920 static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
921 {
922   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
923   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
924   IS                 isrow = b->row,isicol = b->icol;
925   PetscErrorCode     ierr;
926   const PetscInt     *r_h,*ic_h;
927   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_d.data(), *aj_d=aijkok->j_d.data(), *bi_d=baijkok->i_d.data(), *bj_d=baijkok->j_d.data(), *bdiag_d = baijkok->diag_d->data();
928   const PetscScalar  *aa_d = aijkok->a_d.data();
929   PetscScalar        *ba_d = baijkok->a_d.data();
930   PetscBool          row_identity,col_identity;
931   PetscInt           nc, Nf, nVec=32; // should be a parameter
932   PetscContainer     container;
933 
934   PetscFunctionBegin;
935   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n);
936   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
937   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
938   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
939   if (container) {
940     PetscInt *pNf=NULL, nv;
941     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
942     Nf = (*pNf)%1000;
943     nv = (*pNf)/1000;
944     if (nv>0) nVec = nv;
945   } else Nf = 1;
946   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
947   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
948   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
949   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
950 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
951   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
952 #endif
953   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
954   {
955 #define KOKKOS_SHARED_LEVEL 1
956     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
957     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
958     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
959     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
960     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
961     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
962     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
963     size_t flops_h = 0.0;
964     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
965     Kokkos::View<size_t> d_flops_k ("flops");
966     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
967     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
968     Kokkos::deep_copy (d_flops_k, h_flops_k);
969     Kokkos::deep_copy (d_r_k, h_r_k);
970     Kokkos::deep_copy (d_ic_k, h_ic_k);
971     // Fill A --> fact
972     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
973         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in Cuda
974         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
975         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
976         // zero rows of B
977         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
978             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
979             PetscScalar *baL = ba_d + bi_d[rowb];
980             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
981             /* zero (unfactored row) */
982             for (int j=0;j<nzbL;j++) baL[j] = 0;
983             for (int j=0;j<nzbU;j++) baU[j] = 0;
984           });
985         // copy A into B
986         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
987             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
988             const PetscScalar *av    = aa_d + ai_d[rowa];
989             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
990             /* load in initial (unfactored row) */
991             for (int j=0;j<nza;j++) {
992               PetscInt    colb = ic[ajtmp[j]];
993               PetscScalar vala = av[j];
994               if (colb == rowb) {
995                 *(ba_d + bdiag_d[rowb]) = vala;
996               } else {
997                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
998                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
999                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
1000                 for (int j=0; j<nz ; j++) {
1001                   if (pbj[j] == colb) {
1002                     pba[j] = vala;
1003                     set++;
1004                     break;
1005                   }
1006                 }
1007                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1008               }
1009             }
1010           });
1011       });
1012     Kokkos::fence();
1013 
1014     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
1015         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1016         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
1017         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1018         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in Cuda
1019         const PetscInt  start = field*nloc, end = start + nloc;
1020         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
1021         // A22 panel update for each row A(1,:) and col A(:,1)
1022         for (int ii=start; ii<end-1; ii++) {
1023           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
1024           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
1025           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
1026           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
1027           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
1028               PetscInt kIdx = j*Ni + field_block_idx;
1029               if (kIdx >= nzUi) /* void */ ;
1030               else {
1031                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
1032                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
1033                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
1034                 size_t         st_idx;
1035                 // find and do L(k,i) = A(:k,i) / A(i,i)
1036                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
1037                 // get column, there has got to be a better way
1038                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
1039                     //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii);
1040                     if (pjL[j] == ii) {
1041                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
1042                       idx = j; // output
1043                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
1044                     }
1045                   }, st_idx);
1046                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1047                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii);
1048                 else { // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
1049                   // U(i+1,:end)
1050                   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
1051                       PetscScalar Uij = baUi[uiIdx];
1052                       PetscInt    col = bjUi[uiIdx];
1053                       if (col==myk) {
1054                         // A_kk = A_kk - L_ki * U_ij(k)
1055                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
1056                         *Akkv = *Akkv - L_ki() * Uij; // UiK
1057                       } else {
1058                         PetscScalar    *start, *end, *pAkjv=NULL;
1059                         PetscInt       high, low;
1060                         const PetscInt *startj;
1061                         if (col<myk) { // L
1062                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
1063                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
1064                           start = pLki+1; // start at pLki+1, A22(myk,1)
1065                           startj= bj_d + bi_d[myk] + idx;
1066                           end   = ba_d + bi_d[myk+1];
1067                         } else {
1068                           PetscInt idx = bdiag_d[myk+1]+1;
1069                           start = ba_d + idx;
1070                           startj= bj_d + idx;
1071                           end   = ba_d + bdiag_d[myk];
1072                         }
1073                         // search for 'col', use bisection search - TODO
1074                         low  = 0;
1075                         high = (PetscInt)(end-start);
1076                         while (high-low > 5) {
1077                           int t = (low+high)/2;
1078                           if (startj[t] > col) high = t;
1079                           else                 low  = t;
1080                         }
1081                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
1082                           if (startj[pAkjv-start] == col) break;
1083                         }
1084                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col);
1085                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
1086                       }
1087                     });
1088                 }
1089               }
1090             });
1091           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
1092           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
1093         } /* endof for (i=0; i<n; i++) { */
1094         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
1095       });
1096     Kokkos::fence();
1097     Kokkos::deep_copy (h_flops_k, d_flops_k);
1098 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1099     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1100 #elif  defined(PETSC_USE_LOG)
1101     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
1102 #endif
1103     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
1104         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
1105         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
1106         /* Invert diagonal for simpler triangular solves */
1107         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
1108             int i = start + outer_index*Ni + lg_rank%Ni;
1109             if (i < end) {
1110               PetscScalar *pv = ba_d + bdiag_d[i];
1111               *pv = 1.0/(*pv);
1112             }
1113           });
1114       });
1115   }
1116 #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
1117   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1118 #endif
1119   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
1120   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
1121 
1122   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1123   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
1124   if (b->inode.size) {
1125     B->ops->solve = MatSolve_SeqAIJ_Inode;
1126   } else if (row_identity && col_identity) {
1127     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
1128   } else {
1129     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
1130   }
1131   B->offloadmask = PETSC_OFFLOAD_GPU;
1132   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
1133   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
1134   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1135   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1136   B->ops->matsolve          = MatMatSolve_SeqAIJ;
1137   B->assembled              = PETSC_TRUE;
1138   B->preallocated           = PETSC_TRUE;
1139 
1140   PetscFunctionReturn(0);
1141 }
1142 
1143 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1144 {
1145   PetscErrorCode   ierr;
1146 
1147   PetscFunctionBegin;
1148   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1149   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1150   PetscFunctionReturn(0);
1151 }
1152 
1153 static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1154 {
1155   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1156 
1157   PetscFunctionBegin;
1158   if (!factors->sptrsv_symbolic_completed) {
1159     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1160     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1161     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1162   }
1163   PetscFunctionReturn(0);
1164 }
1165 
1166 /* Check if we need to update factors etc for transpose solve */
1167 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1168 {
1169   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1170   MatColumnIndexType             n = A->rmap->n;
1171 
1172   PetscFunctionBegin;
1173   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1174     /* Update L^T and do sptrsv symbolic */
1175     factors->iLt_d = MatRowOffsetKokkosView("factors->iLt_d",n+1);
1176     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1177     factors->jLt_d = MatColumnIndexKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1178     factors->aLt_d = MatValueKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1179 
1180     KokkosKernels::Impl::transpose_matrix<
1181       ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView,
1182       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView,
1183       MatRowOffsetKokkosView,DefaultExecutionSpace>(
1184         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1185         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1186 
1187     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1188       We have to sort the indices, until KK provides finer control options.
1189     */
1190     KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace,
1191       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>(
1192         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1193 
1194     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1195 
1196     /* Update U^T and do sptrsv symbolic */
1197     factors->iUt_d = MatRowOffsetKokkosView("factors->iUt_d",n+1);
1198     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1199     factors->jUt_d = MatColumnIndexKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1200     factors->aUt_d = MatValueKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1201 
1202     KokkosKernels::Impl::transpose_matrix<
1203       ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView,
1204       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView,
1205       MatRowOffsetKokkosView,DefaultExecutionSpace>(
1206         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1207         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1208 
1209     /* Sort indices. See comments above */
1210     KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace,
1211       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>(
1212         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1213 
1214     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1215     factors->transpose_updated = PETSC_TRUE;
1216   }
1217   PetscFunctionReturn(0);
1218 }
1219 
1220 /* Solve Ax = b, with A = LU */
1221 static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1222 {
1223   PetscErrorCode                 ierr;
1224   ConstPetscScalarKokkosView     bv;
1225   PetscScalarKokkosView          xv;
1226   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1227 
1228   PetscFunctionBegin;
1229   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
1230   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1231   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1232   /* Solve L tmpv = b */
1233   KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector);
1234   /* Solve Ux = tmpv */
1235   KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv);
1236   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1237   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1238   PetscFunctionReturn(0);
1239 }
1240 
1241 /* Solve A^T x = b, with A^T = U^T L^T */
1242 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1243 {
1244   PetscErrorCode                 ierr;
1245   ConstPetscScalarKokkosView     bv;
1246   PetscScalarKokkosView          xv;
1247   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1248 
1249   PetscFunctionBegin;
1250   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
1251   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1252   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1253   /* Solve U^T tmpv = b */
1254   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1255 
1256   /* Solve L^T x = tmpv */
1257   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1258   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1259   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1260   PetscFunctionReturn(0);
1261 }
1262 
1263 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1264 {
1265   PetscErrorCode                 ierr;
1266   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1267   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1268   PetscInt                       fill_lev = info->levels;
1269 
1270   PetscFunctionBegin;
1271   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1272   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,aijkok->a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1273 
1274   B->assembled                       = PETSC_TRUE;
1275   B->preallocated                    = PETSC_TRUE;
1276   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1277   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1278   B->ops->matsolve                   = NULL;
1279   B->ops->matsolvetranspose          = NULL;
1280   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1281 
1282   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1283   factors->transpose_updated         = PETSC_FALSE;
1284   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1285   PetscFunctionReturn(0);
1286 }
1287 
1288 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1289 {
1290   PetscErrorCode                 ierr;
1291   Mat_SeqAIJKokkos               *aijkok;
1292   Mat_SeqAIJ                     *b;
1293   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1294   PetscInt                       fill_lev = info->levels;
1295   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1296   PetscInt                       n = A->rmap->n;
1297 
1298   PetscFunctionBegin;
1299   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1300   /* Rebuild factors */
1301   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1302   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1303 
1304   /* Create a spiluk handle and then do symbolic factorization */
1305   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1306   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1307 
1308   auto spiluk_handle = factors->kh.get_spiluk_handle();
1309 
1310   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1311   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1312   Kokkos::realloc(factors->iU_d,n+1);
1313   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1314 
1315   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1316   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1317   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1318 
1319   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1320   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1321   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1322   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1323 
1324   /* TODO: add options to select sptrsv algorithms */
1325   /* Create sptrsv handles for L, U and their transpose */
1326  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1327   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1328  #else
1329   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1330  #endif
1331 
1332   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1333   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1334   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1335   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1336 
1337   /* Fill fields of the factor matrix B */
1338   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1339   b     = (Mat_SeqAIJ*)B->data;
1340   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1341   B->info.fill_ratio_given  = info->fill;
1342   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1343 
1344   B->offloadmask            = PETSC_OFFLOAD_GPU;
1345   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1346   PetscFunctionReturn(0);
1347 }
1348 
1349 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1350 {
1351   PetscErrorCode   ierr;
1352   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
1353   const PetscInt   nrows   = A->rmap->n;
1354 
1355   PetscFunctionBegin;
1356   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1357   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
1358   // move B data into Kokkos
1359   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
1360   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
1361   {
1362     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1363     if (!baijkok->diag_d) {
1364       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1365       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1366       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
1367     }
1368   }
1369   PetscFunctionReturn(0);
1370 }
1371 
1372 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1373 {
1374   PetscFunctionBegin;
1375   *type = MATSOLVERKOKKOS;
1376   PetscFunctionReturn(0);
1377 }
1378 
1379 // use -pc_factor_mat_solver_type kokkosdevice
1380 static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
1381 {
1382   PetscFunctionBegin;
1383   *type = MATSOLVERKOKKOSDEVICE;
1384   PetscFunctionReturn(0);
1385 }
1386 
1387 /*MC
1388   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1389   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1390 
1391   Level: beginner
1392 
1393 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1394 M*/
1395 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1396 {
1397   PetscErrorCode ierr;
1398   PetscInt       n = A->rmap->n;
1399 
1400   PetscFunctionBegin;
1401   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1402   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1403   (*B)->factortype = ftype;
1404   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1405   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1406 
1407   if (ftype == MAT_FACTOR_LU) {
1408     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1409     (*B)->canuseordering         = PETSC_TRUE;
1410     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1411   } else if (ftype == MAT_FACTOR_ILU) {
1412     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1413     (*B)->canuseordering         = PETSC_FALSE;
1414     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1415   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1416 
1417   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1418   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1419   PetscFunctionReturn(0);
1420 }
1421 
1422 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1423 {
1424   PetscErrorCode ierr;
1425   PetscInt       n = A->rmap->n;
1426 
1427   PetscFunctionBegin;
1428   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1429   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1430   (*B)->factortype = ftype;
1431   (*B)->canuseordering = PETSC_TRUE;
1432   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1433   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1434 
1435   if (ftype == MAT_FACTOR_LU) {
1436     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1437     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1438   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1439 
1440   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1441   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1442   PetscFunctionReturn(0);
1443 }
1444 
1445 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1446 {
1447   PetscErrorCode ierr;
1448 
1449   PetscFunctionBegin;
1450   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1451   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1452   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1453   PetscFunctionReturn(0);
1454 }
1455 
1456