xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision f73b041577f311cd4e522d727240702cd4268ffe)
1152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
28c3ff71bSJunchao Zhang #include <petscsystypes.h>
38c3ff71bSJunchao Zhang #include <petscerror.h>
4152b3e56SJunchao Zhang #include <petscvec_kokkos.hpp>
58c3ff71bSJunchao Zhang 
68c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
7f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
88c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
98c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
108c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h>
118c3ff71bSJunchao Zhang 
128c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
13a587d139SMark #include <petscmat.h>
148c3ff71bSJunchao Zhang 
158c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
168c3ff71bSJunchao Zhang 
178c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
188c3ff71bSJunchao Zhang {
198c3ff71bSJunchao Zhang   PetscErrorCode    ierr;
20a587d139SMark   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
218c3ff71bSJunchao Zhang 
228c3ff71bSJunchao Zhang   PetscFunctionBegin;
238c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
24a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
25a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
26a587d139SMark   }
278c3ff71bSJunchao Zhang   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
288c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
298c3ff71bSJunchao Zhang }
308c3ff71bSJunchao Zhang 
318c3ff71bSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
328c3ff71bSJunchao Zhang {
33152b3e56SJunchao Zhang   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ*>(A->data);
348c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
358c3ff71bSJunchao Zhang   PetscInt                  nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
368c3ff71bSJunchao Zhang 
378c3ff71bSJunchao Zhang   PetscFunctionBegin;
38152b3e56SJunchao Zhang   /* If aijkok is not built yet OR the nonzero pattern on CPU has changed, build aijkok from the scratch */
398c3ff71bSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
408c3ff71bSJunchao Zhang     delete aijkok;
418c3ff71bSJunchao Zhang     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
428c3ff71bSJunchao Zhang     aijkok->nonzerostate = A->nonzerostate;
438c3ff71bSJunchao Zhang     A->spptr             = aijkok;
448c3ff71bSJunchao Zhang   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
45152b3e56SJunchao Zhang     // Kokkos::deep_copy(aijkok->a_d,aijkok->a_h);
46152b3e56SJunchao Zhang     aijkok->a_dual.modify_host(); /* Mark host is modified */
47152b3e56SJunchao Zhang     aijkok->a_dual.sync_device(); /* Sync the device */
48152b3e56SJunchao Zhang     aijkok->need_sync_At_values = PETSC_TRUE; /* Mark matT and matH need to sync, but don't sync them now */
49152b3e56SJunchao Zhang     aijkok->need_sync_Ah_values = PETSC_TRUE;
508c3ff71bSJunchao Zhang   }
518c3ff71bSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
528c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
538c3ff71bSJunchao Zhang }
548c3ff71bSJunchao Zhang 
55f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
56f0cf5187SStefano Zampini {
57f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
58f0cf5187SStefano Zampini 
59f0cf5187SStefano Zampini   PetscFunctionBegin;
60f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
61f0cf5187SStefano Zampini   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
62f0cf5187SStefano Zampini     if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
63f0cf5187SStefano Zampini     Kokkos::deep_copy(aijkok->a_h,aijkok->a_d);
64f0cf5187SStefano Zampini     A->offloadmask = PETSC_OFFLOAD_BOTH;
65f0cf5187SStefano Zampini   }
66f0cf5187SStefano Zampini   PetscFunctionReturn(0);
67f0cf5187SStefano Zampini }
68f0cf5187SStefano Zampini 
69f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
70f0cf5187SStefano Zampini {
71f0cf5187SStefano Zampini   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
72f0cf5187SStefano Zampini   PetscErrorCode ierr;
73f0cf5187SStefano Zampini 
74f0cf5187SStefano Zampini   PetscFunctionBegin;
75f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
76f0cf5187SStefano Zampini   *array = a->a;
77f0cf5187SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
78f0cf5187SStefano Zampini   PetscFunctionReturn(0);
79f0cf5187SStefano Zampini }
80f0cf5187SStefano Zampini 
81a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
82a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat)
83a587d139SMark {
84a587d139SMark   Mat_SeqAIJKokkos *aijkok;
85a587d139SMark   Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat);
86a587d139SMark 
87a587d139SMark   PetscFunctionBegin;
88a587d139SMark   // ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
89a587d139SMark   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
90a587d139SMark   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos");
91152b3e56SJunchao Zhang   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
92a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
93a587d139SMark   PetscFunctionReturn(0);
94a587d139SMark }
95a587d139SMark 
96a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
97a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat)
98a587d139SMark {
99a587d139SMark   Mat_SeqAIJKokkos *aijkok;
100a587d139SMark 
101a587d139SMark   PetscFunctionBegin;
102a587d139SMark   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
103a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
104a587d139SMark     *d_mat = aijkok->device_mat_d.data();
105a587d139SMark   } else {
106a587d139SMark     PetscErrorCode   ierr;
107a587d139SMark     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
108a587d139SMark     *d_mat  = NULL;
109a587d139SMark   }
110a587d139SMark   PetscFunctionReturn(0);
111a587d139SMark }
112152b3e56SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose(Mat A)
113152b3e56SJunchao Zhang {
114152b3e56SJunchao Zhang   PetscErrorCode                   ierr;
115152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
116152b3e56SJunchao Zhang 
117152b3e56SJunchao Zhang   PetscFunctionBegin;
118152b3e56SJunchao Zhang   if (!aijkok->At) { /* Generate At for the first time */
119152b3e56SJunchao Zhang     ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->At);CHKERRQ(ierr);
120152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr);
121152b3e56SJunchao Zhang   } else if (aijkok->need_sync_At_values) { /* Only update At values */
122152b3e56SJunchao Zhang     ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->At);CHKERRQ(ierr);
123152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr);
124152b3e56SJunchao Zhang   }
125152b3e56SJunchao Zhang   aijkok->need_sync_At_values = PETSC_FALSE;
126152b3e56SJunchao Zhang   PetscFunctionReturn(0);
127152b3e56SJunchao Zhang }
128152b3e56SJunchao Zhang 
129152b3e56SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTransposeHermitian(Mat A)
130152b3e56SJunchao Zhang {
131152b3e56SJunchao Zhang   PetscErrorCode                   ierr;
132152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
133152b3e56SJunchao Zhang 
134152b3e56SJunchao Zhang   PetscFunctionBegin;
135152b3e56SJunchao Zhang   if (!aijkok->Ah) { /* Generate Ah for the first time */
136152b3e56SJunchao Zhang     ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->Ah);CHKERRQ(ierr);
137152b3e56SJunchao Zhang     ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr);
138152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr);
139152b3e56SJunchao Zhang   } else if (aijkok->need_sync_Ah_values) { /* Only update Ah values */
140152b3e56SJunchao Zhang     ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->Ah);CHKERRQ(ierr);
141152b3e56SJunchao Zhang     ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr);
142152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr);
143152b3e56SJunchao Zhang   }
144152b3e56SJunchao Zhang   aijkok->need_sync_Ah_values = PETSC_FALSE;
145152b3e56SJunchao Zhang   PetscFunctionReturn(0);
146152b3e56SJunchao Zhang }
147a587d139SMark 
1488c3ff71bSJunchao Zhang /* y = A x */
1498c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1508c3ff71bSJunchao Zhang {
1518c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1528c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
153152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
154152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
1558c3ff71bSJunchao Zhang 
1568c3ff71bSJunchao Zhang   PetscFunctionBegin;
1578c3ff71bSJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
158152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
159152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
1608c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
161152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
162152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
163152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
164bb2d6e60SJunchao Zhang   ierr   = WaitForKokkos();CHKERRQ(ierr);
165152b3e56SJunchao Zhang   /* 2.0*aijkok->csrmat.nnz()-aijkok->csrmat.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */
166152b3e56SJunchao Zhang   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
1678c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1688c3ff71bSJunchao Zhang }
1698c3ff71bSJunchao Zhang 
1708c3ff71bSJunchao Zhang /* y = A^T x */
1718c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1728c3ff71bSJunchao Zhang {
1738c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1748c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
175152b3e56SJunchao Zhang   Mat                              B;
176152b3e56SJunchao Zhang   const char                       *mode;
177152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
178152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
1798c3ff71bSJunchao Zhang 
1808c3ff71bSJunchao Zhang   PetscFunctionBegin;
1818c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
182152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
183152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
184152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
185152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr);
186152b3e56SJunchao Zhang     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At;
187152b3e56SJunchao Zhang     mode = "N";
188152b3e56SJunchao Zhang   } else {
189152b3e56SJunchao Zhang     B    = A;
190152b3e56SJunchao Zhang     mode = "T";
191152b3e56SJunchao Zhang   }
192152b3e56SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
193152b3e56SJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
194152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
195152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
196bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
197152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
1988c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1998c3ff71bSJunchao Zhang }
2008c3ff71bSJunchao Zhang 
2018c3ff71bSJunchao Zhang /* y = A^H x */
2028c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2038c3ff71bSJunchao Zhang {
2048c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2058c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
206152b3e56SJunchao Zhang   Mat                              B;
207152b3e56SJunchao Zhang   const char                       *mode;
208152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
209152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2108c3ff71bSJunchao Zhang 
2118c3ff71bSJunchao Zhang   PetscFunctionBegin;
2128c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
213152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
214152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
215152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
216152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTransposeHermitian(A);CHKERRQ(ierr);
217152b3e56SJunchao Zhang     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah;
218152b3e56SJunchao Zhang     mode = "N";
219152b3e56SJunchao Zhang   } else {
220152b3e56SJunchao Zhang     B    = A;
221152b3e56SJunchao Zhang     mode = "C";
222152b3e56SJunchao Zhang   }
223152b3e56SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
224152b3e56SJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
225152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
226152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
227bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
228152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2298c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2308c3ff71bSJunchao Zhang }
2318c3ff71bSJunchao Zhang 
2328c3ff71bSJunchao Zhang /* z = A x + y */
2338c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
2348c3ff71bSJunchao Zhang {
2358c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2368c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
237152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
238152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
2398c3ff71bSJunchao Zhang 
2408c3ff71bSJunchao Zhang   PetscFunctionBegin;
2418c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
242152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
243152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
244152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
2458c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
2468c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
247152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
248152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
249152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
250152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
251bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
252152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2538c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2548c3ff71bSJunchao Zhang }
2558c3ff71bSJunchao Zhang 
2568c3ff71bSJunchao Zhang /* z = A^T x + y */
2578c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
2588c3ff71bSJunchao Zhang {
2598c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2608c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
261152b3e56SJunchao Zhang   Mat                              B;
262152b3e56SJunchao Zhang   const char                       *mode;
263152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
264152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
2658c3ff71bSJunchao Zhang 
2668c3ff71bSJunchao Zhang   PetscFunctionBegin;
2678c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
268152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
269152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
270152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
2718c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
272152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
273152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr);
274152b3e56SJunchao Zhang     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At;
275152b3e56SJunchao Zhang     mode = "N";
276152b3e56SJunchao Zhang   } else {
277152b3e56SJunchao Zhang     B    = A;
278152b3e56SJunchao Zhang     mode = "T";
279152b3e56SJunchao Zhang   }
280152b3e56SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
281152b3e56SJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
282152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
283152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
284152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
285bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
286152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2878c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2888c3ff71bSJunchao Zhang }
2898c3ff71bSJunchao Zhang 
2908c3ff71bSJunchao Zhang /* z = A^H x + y */
2918c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
2928c3ff71bSJunchao Zhang {
2938c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2948c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
295152b3e56SJunchao Zhang   Mat                              B;
296152b3e56SJunchao Zhang   const char                       *mode;
297152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
298152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
2998c3ff71bSJunchao Zhang 
3008c3ff71bSJunchao Zhang   PetscFunctionBegin;
3018c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
302152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
303152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
304152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3058c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
306152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
307152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTransposeHermitian(A);CHKERRQ(ierr);
308152b3e56SJunchao Zhang     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah;
309152b3e56SJunchao Zhang     mode = "N";
310152b3e56SJunchao Zhang   } else {
311152b3e56SJunchao Zhang     B    = A;
312152b3e56SJunchao Zhang     mode = "C";
313152b3e56SJunchao Zhang   }
314152b3e56SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
315152b3e56SJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
316152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
317152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
318152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
319bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
320152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
321152b3e56SJunchao Zhang   PetscFunctionReturn(0);
322152b3e56SJunchao Zhang }
323152b3e56SJunchao Zhang 
324152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
325152b3e56SJunchao Zhang {
326152b3e56SJunchao Zhang   PetscErrorCode            ierr;
327152b3e56SJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
328152b3e56SJunchao Zhang 
329152b3e56SJunchao Zhang   PetscFunctionBegin;
330152b3e56SJunchao Zhang   switch (op) {
331152b3e56SJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
332152b3e56SJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
333152b3e56SJunchao Zhang       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
334152b3e56SJunchao Zhang       A->form_explicit_transpose = flg;
335152b3e56SJunchao Zhang       break;
336152b3e56SJunchao Zhang     default:
337152b3e56SJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
338152b3e56SJunchao Zhang       break;
339152b3e56SJunchao Zhang   }
3408c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3418c3ff71bSJunchao Zhang }
3428c3ff71bSJunchao Zhang 
3433d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
3448c3ff71bSJunchao Zhang {
3458c3ff71bSJunchao Zhang   PetscErrorCode ierr;
3468c3ff71bSJunchao Zhang   Mat            B;
347c58ef05eSStefano Zampini   Mat_SeqAIJ     *aij;
3488c3ff71bSJunchao Zhang 
3498c3ff71bSJunchao Zhang   PetscFunctionBegin;
350a3f881fbSStefano Zampini   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
3518c3ff71bSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
3528c3ff71bSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
3538c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
3548c3ff71bSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
3558c3ff71bSJunchao Zhang   }
3568c3ff71bSJunchao Zhang 
3578c3ff71bSJunchao Zhang   B    = *newmat;
3588c3ff71bSJunchao Zhang   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
3598c3ff71bSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
3608c3ff71bSJunchao Zhang   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
3610d8bce8aSStefano Zampini   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
362f0cf5187SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr);
363c58ef05eSStefano Zampini   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
364c58ef05eSStefano Zampini   aij  = (Mat_SeqAIJ*)B->data;
365c58ef05eSStefano Zampini   aij->inode.use = PETSC_FALSE;
3668c3ff71bSJunchao Zhang 
3678c3ff71bSJunchao Zhang   B->offloadmask = PETSC_OFFLOAD_CPU;
3688c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3698c3ff71bSJunchao Zhang }
3708c3ff71bSJunchao Zhang 
3718c3ff71bSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
3728c3ff71bSJunchao Zhang {
3738c3ff71bSJunchao Zhang   PetscErrorCode ierr;
3748c3ff71bSJunchao Zhang 
3758c3ff71bSJunchao Zhang   PetscFunctionBegin;
3768c3ff71bSJunchao Zhang   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
3778c3ff71bSJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
3788c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3798c3ff71bSJunchao Zhang }
3808c3ff71bSJunchao Zhang 
3818c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
3828c3ff71bSJunchao Zhang {
3838c3ff71bSJunchao Zhang   PetscErrorCode        ierr;
3848c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos      *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
3858c3ff71bSJunchao Zhang 
3868c3ff71bSJunchao Zhang   PetscFunctionBegin;
3878f7e8f9dSMark Adams   if (aijkok) {
3888f7e8f9dSMark Adams     if (aijkok->device_mat_d.data()) {
389a587d139SMark       delete aijkok->colmap_d;
390a587d139SMark       delete aijkok->i_uncompressed_d;
391a587d139SMark     }
3928f7e8f9dSMark Adams     if (aijkok->diag_d) delete aijkok->diag_d;
3938f7e8f9dSMark Adams   }
3948c3ff71bSJunchao Zhang   delete aijkok;
395f0cf5187SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr);
396152b3e56SJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
397152b3e56SJunchao Zhang   A->spptr = NULL;
3988c3ff71bSJunchao Zhang   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
3998c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4008c3ff71bSJunchao Zhang }
4018c3ff71bSJunchao Zhang 
402a3f881fbSStefano Zampini #if 0
403a3f881fbSStefano Zampini static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data)
404a3f881fbSStefano Zampini {
405a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data);
406a3f881fbSStefano Zampini 
407a3f881fbSStefano Zampini   PetscFunctionBegin;
408a3f881fbSStefano Zampini   delete kh;
409a3f881fbSStefano Zampini   PetscFunctionReturn(0);
410a3f881fbSStefano Zampini }
411a3f881fbSStefano Zampini 
412a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
413a3f881fbSStefano Zampini {
414a3f881fbSStefano Zampini   Mat_Product          *product = C->product;
415a3f881fbSStefano Zampini   Mat                  A,B;
416a3f881fbSStefano Zampini   MatProductType       ptype;
417a3f881fbSStefano Zampini   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
418a3f881fbSStefano Zampini   bool                 tA,tB;
419a3f881fbSStefano Zampini   PetscErrorCode       ierr;
420a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh;
421a3f881fbSStefano Zampini   Mat_SeqAIJ           *c;
422a3f881fbSStefano Zampini 
423a3f881fbSStefano Zampini   PetscFunctionBegin;
424a3f881fbSStefano Zampini   MatCheckProduct(C,1);
425a3f881fbSStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
426a3f881fbSStefano Zampini   A = product->A;
427a3f881fbSStefano Zampini   B = product->B;
428a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
429a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
430a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
431a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
432a3f881fbSStefano Zampini   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
433a3f881fbSStefano Zampini   kh   = static_cast<MatMatKernelHandle_t*>(C->product->data);
434a3f881fbSStefano Zampini   ptype = product->type;
435a3f881fbSStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
436a3f881fbSStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
437a3f881fbSStefano Zampini   switch (ptype) {
438a3f881fbSStefano Zampini   case MATPRODUCT_AB:
439a3f881fbSStefano Zampini     tA = false;
440a3f881fbSStefano Zampini     tB = false;
441a3f881fbSStefano Zampini     break;
442a3f881fbSStefano Zampini   case MATPRODUCT_AtB:
443a3f881fbSStefano Zampini     tA = true;
444a3f881fbSStefano Zampini     tB = false;
445a3f881fbSStefano Zampini     break;
446a3f881fbSStefano Zampini   case MATPRODUCT_ABt:
447a3f881fbSStefano Zampini     tA = false;
448a3f881fbSStefano Zampini     tB = true;
449a3f881fbSStefano Zampini     break;
450a3f881fbSStefano Zampini   default:
451a3f881fbSStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
452a3f881fbSStefano Zampini   }
453a3f881fbSStefano Zampini 
454a3f881fbSStefano Zampini   KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr);
455a3f881fbSStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
456a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
457a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
458a3f881fbSStefano Zampini   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
459a3f881fbSStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
460a3f881fbSStefano Zampini   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
461a3f881fbSStefano Zampini   c->reallocs         = 0;
462a3f881fbSStefano Zampini   C->info.mallocs    += 0;
463a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
464a3f881fbSStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
465a3f881fbSStefano Zampini   C->num_ass++;
466a3f881fbSStefano Zampini   /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */
467a3f881fbSStefano Zampini   // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray()
468a3f881fbSStefano Zampini   C->offloadmask = PETSC_OFFLOAD_BOTH;
469a3f881fbSStefano Zampini   // Also, we should add support to copy back from device to host
470a3f881fbSStefano Zampini   PetscFunctionReturn(0);
471a3f881fbSStefano Zampini }
472a3f881fbSStefano Zampini 
473a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
474a3f881fbSStefano Zampini {
475a3f881fbSStefano Zampini   Mat_Product          *product = C->product;
476a3f881fbSStefano Zampini   Mat                  A,B;
477a3f881fbSStefano Zampini   MatProductType       ptype;
478a3f881fbSStefano Zampini   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
479a3f881fbSStefano Zampini   PetscInt             m,n,k;
480a3f881fbSStefano Zampini   bool                 tA,tB;
481a3f881fbSStefano Zampini   PetscErrorCode       ierr;
482a3f881fbSStefano Zampini   Mat_SeqAIJ           *c;
483a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh;
484a3f881fbSStefano Zampini 
485a3f881fbSStefano Zampini   PetscFunctionBegin;
486a3f881fbSStefano Zampini   MatCheckProduct(C,1);
487a3f881fbSStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
488a3f881fbSStefano Zampini   A = product->A;
489a3f881fbSStefano Zampini   B = product->B;
490a3f881fbSStefano Zampini   // TODO only copy the i,j data, not the values
491a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
492a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
493a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
494a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
495a3f881fbSStefano Zampini   ptype = product->type;
496a3f881fbSStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
497a3f881fbSStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
498a3f881fbSStefano Zampini   switch (ptype) {
499a3f881fbSStefano Zampini   case MATPRODUCT_AB:
500a3f881fbSStefano Zampini     tA = false;
501a3f881fbSStefano Zampini     tB = false;
502a3f881fbSStefano Zampini     m = A->rmap->n;
503a3f881fbSStefano Zampini     n = B->cmap->n;
504a3f881fbSStefano Zampini     break;
505a3f881fbSStefano Zampini   case MATPRODUCT_AtB:
506a3f881fbSStefano Zampini     tA = true;
507a3f881fbSStefano Zampini     tB = false;
508a3f881fbSStefano Zampini     m = A->cmap->n;
509a3f881fbSStefano Zampini     n = B->cmap->n;
510a3f881fbSStefano Zampini     break;
511a3f881fbSStefano Zampini   case MATPRODUCT_ABt:
512a3f881fbSStefano Zampini     tA = false;
513a3f881fbSStefano Zampini     tB = true;
514a3f881fbSStefano Zampini     m = A->rmap->n;
515a3f881fbSStefano Zampini     n = B->rmap->n;
516a3f881fbSStefano Zampini     break;
517a3f881fbSStefano Zampini   default:
518a3f881fbSStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
519a3f881fbSStefano Zampini   }
520a3f881fbSStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
521a3f881fbSStefano Zampini   ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr);
522a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
523a3f881fbSStefano Zampini 
524a3f881fbSStefano Zampini   kh = new MatMatKernelHandle_t;
525a3f881fbSStefano Zampini   // TODO SZ: ADD RUNTIME SELECTION OF THESE
526a3f881fbSStefano Zampini   kh->set_team_work_size(16);
527a3f881fbSStefano Zampini   kh->set_dynamic_scheduling(true);
528a3f881fbSStefano Zampini   // Select an spgemm algorithm, limited by configuration at compile-time and
529a3f881fbSStefano Zampini   // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED,
530a3f881fbSStefano Zampini   // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL}
531a3f881fbSStefano Zampini   std::string myalg("SPGEMM_KK_MEMORY");
532a3f881fbSStefano Zampini   kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg));
533a3f881fbSStefano Zampini 
534a3f881fbSStefano Zampini   /////////////////////////////////////
535a3f881fbSStefano Zampini   // TODO JZ
536a3f881fbSStefano Zampini   ckok = NULL; //new Mat_SeqAIJKokkos();
537a3f881fbSStefano Zampini   C->spptr = ckok;
538a3f881fbSStefano Zampini   KokkosCsrMatrix_t ccsr; // here only to have the code compile
539a3f881fbSStefano Zampini   KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr);
540a3f881fbSStefano Zampini   //cerr = WaitForKOKKOS();CHKERRCUDA(cerr);
541a3f881fbSStefano Zampini   //c->nz = get_nnz_from_ccsr
542a3f881fbSStefano Zampini   //////////////////////////////////////
543a3f881fbSStefano Zampini   c->singlemalloc = PETSC_FALSE;
544a3f881fbSStefano Zampini   c->free_a       = PETSC_TRUE;
545a3f881fbSStefano Zampini   c->free_ij      = PETSC_TRUE;
546a3f881fbSStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
547a3f881fbSStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
548a3f881fbSStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
549a3f881fbSStefano Zampini   ////////////////////////////////////
550a3f881fbSStefano Zampini   // TODO JZ copy from device to c->i and c->j
551a3f881fbSStefano Zampini   ////////////////////////////////////
552a3f881fbSStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
553a3f881fbSStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
554a3f881fbSStefano Zampini   c->maxnz = c->nz;
555a3f881fbSStefano Zampini   c->nonzerorowcnt = 0;
556a3f881fbSStefano Zampini   c->rmax = 0;
557a3f881fbSStefano Zampini   for (k = 0; k < m; k++) {
558a3f881fbSStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
559a3f881fbSStefano Zampini     c->ilen[k] = c->imax[k] = nn;
560a3f881fbSStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
561a3f881fbSStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
562a3f881fbSStefano Zampini   }
563a3f881fbSStefano Zampini 
564a3f881fbSStefano Zampini   C->nonzerostate++;
565a3f881fbSStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
566a3f881fbSStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
567a3f881fbSStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
568a3f881fbSStefano Zampini   ckok->nonzerostate = C->nonzerostate;
569a3f881fbSStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
570a3f881fbSStefano Zampini   C->preallocated  = PETSC_TRUE;
571a3f881fbSStefano Zampini   C->assembled     = PETSC_FALSE;
572a3f881fbSStefano Zampini   C->was_assembled = PETSC_FALSE;
573a3f881fbSStefano Zampini 
574a3f881fbSStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
575a3f881fbSStefano Zampini   C->product->data = kh;
576a3f881fbSStefano Zampini   C->product->destroy = MatMatKernelHandleDestroy_Private;
577a3f881fbSStefano Zampini   PetscFunctionReturn(0);
578a3f881fbSStefano Zampini }
579a3f881fbSStefano Zampini 
580a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
581a3f881fbSStefano Zampini PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
582a3f881fbSStefano Zampini {
583a3f881fbSStefano Zampini   Mat_Product    *product = mat->product;
584a3f881fbSStefano Zampini   PetscErrorCode ierr;
585a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
586a3f881fbSStefano Zampini 
587a3f881fbSStefano Zampini   PetscFunctionBegin;
588a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
589a3f881fbSStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
590a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
591a3f881fbSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
592a3f881fbSStefano Zampini   }
593a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
594a3f881fbSStefano Zampini     switch (product->type) {
595a3f881fbSStefano Zampini     case MATPRODUCT_AB:
596a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
597a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
598a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
599a3f881fbSStefano Zampini       break;
600a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
601a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
602a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
603a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
604a3f881fbSStefano Zampini       break;
605a3f881fbSStefano Zampini     default:
606a3f881fbSStefano Zampini       break;
607a3f881fbSStefano Zampini     }
608a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
609a3f881fbSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
610a3f881fbSStefano Zampini   }
611a3f881fbSStefano Zampini   PetscFunctionReturn(0);
612a3f881fbSStefano Zampini }
613a3f881fbSStefano Zampini #endif
614a3f881fbSStefano Zampini 
6158c3ff71bSJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
6168c3ff71bSJunchao Zhang {
6178c3ff71bSJunchao Zhang   PetscErrorCode ierr;
6188c3ff71bSJunchao Zhang 
6198c3ff71bSJunchao Zhang   PetscFunctionBegin;
6208c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
6218c3ff71bSJunchao Zhang   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
6228c3ff71bSJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
6238c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
6248c3ff71bSJunchao Zhang }
6258c3ff71bSJunchao Zhang 
626a587d139SMark static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
627a587d139SMark {
628a587d139SMark   PetscErrorCode    ierr;
629a587d139SMark   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
630a587d139SMark   PetscFunctionBegin;
631a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
632a587d139SMark     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported");
633a587d139SMark   }
634a587d139SMark   ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr);
635a587d139SMark   PetscFunctionReturn(0);
636a587d139SMark }
637a587d139SMark 
638f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
639f0cf5187SStefano Zampini {
640f0cf5187SStefano Zampini   PetscErrorCode   ierr;
641f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
642f0cf5187SStefano Zampini 
643f0cf5187SStefano Zampini   PetscFunctionBegin;
644f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
645f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
646f0cf5187SStefano Zampini   KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d);
647f0cf5187SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
648f0cf5187SStefano Zampini   ierr = WaitForKokkos();CHKERRQ(ierr);
649f0cf5187SStefano Zampini   ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr);
650f0cf5187SStefano Zampini   // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
651f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
652f0cf5187SStefano Zampini   PetscFunctionReturn(0);
653f0cf5187SStefano Zampini }
654f0cf5187SStefano Zampini 
655a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
656a587d139SMark {
657a587d139SMark   PetscErrorCode   ierr;
658a587d139SMark   PetscBool        both = PETSC_FALSE;
659a587d139SMark   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
660a587d139SMark   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
661a587d139SMark 
662a587d139SMark   PetscFunctionBegin;
663a587d139SMark   if (aijkok && aijkok->a_d.data()) {
664f0cf5187SStefano Zampini     KokkosBlas::fill(aijkok->a_d,0.0);
665a587d139SMark     both = PETSC_TRUE;
666a587d139SMark   }
667a587d139SMark   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
668a587d139SMark   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
669a587d139SMark   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
670a587d139SMark   else A->offloadmask = PETSC_OFFLOAD_CPU;
671a587d139SMark   PetscFunctionReturn(0);
672a587d139SMark }
673a587d139SMark 
674f0cf5187SStefano Zampini static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str)
675a587d139SMark {
676a587d139SMark   PetscErrorCode ierr;
677a587d139SMark   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
678a587d139SMark 
679a587d139SMark   PetscFunctionBegin;
680f0cf5187SStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
681a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
682a587d139SMark     PetscFunctionReturn(0);
683a587d139SMark   }
684f0cf5187SStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) {
685a587d139SMark     PetscBool e;
686a587d139SMark     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
687a587d139SMark     if (e) {
688a587d139SMark       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
689f0cf5187SStefano Zampini       if (e) str = SAME_NONZERO_PATTERN;
690a587d139SMark     }
691a587d139SMark   }
692a587d139SMark   if (str != SAME_NONZERO_PATTERN) {
693a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
694a587d139SMark     PetscFunctionReturn(0);
695a587d139SMark   } else {
696a587d139SMark     if (Y->offloadmask == PETSC_OFFLOAD_CPU && X->offloadmask == PETSC_OFFLOAD_CPU) {
697a587d139SMark       ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
698a587d139SMark       PetscFunctionReturn(0);
699f0cf5187SStefano Zampini     } else {
700a587d139SMark       ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
701f0cf5187SStefano Zampini       ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
702a587d139SMark     }
703a587d139SMark     Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
704a587d139SMark     Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
705a587d139SMark     if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) {
706f0cf5187SStefano Zampini       KokkosBlas::axpy(a,aijkokX->a_d,aijkokY->a_d);
707f0cf5187SStefano Zampini       Y->offloadmask = PETSC_OFFLOAD_GPU;
708f0cf5187SStefano Zampini       ierr = WaitForKokkos();CHKERRQ(ierr);
709a587d139SMark       ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr);
710f0cf5187SStefano Zampini       // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
711f0cf5187SStefano Zampini       ierr = MatSeqAIJKokkosSyncHost(Y);CHKERRQ(ierr);
712a587d139SMark     } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???");
713a587d139SMark   }
714a587d139SMark   PetscFunctionReturn(0);
715a587d139SMark }
716a587d139SMark 
7178f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOS(Mat B,Mat A,const MatFactorInfo *info)
7188f7e8f9dSMark Adams {
7198f7e8f9dSMark Adams   PetscErrorCode ierr;
7208f7e8f9dSMark Adams 
7218f7e8f9dSMark Adams   PetscFunctionBegin;
7228f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
7238f7e8f9dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
7248f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
7258f7e8f9dSMark Adams   PetscFunctionReturn(0);
7268f7e8f9dSMark Adams }
7278f7e8f9dSMark Adams 
7288c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
7298c3ff71bSJunchao Zhang {
7308c3ff71bSJunchao Zhang   PetscFunctionBegin;
7316f3d89d0SStefano Zampini   A->boundtocpu = PETSC_FALSE;
7326f3d89d0SStefano Zampini 
733a587d139SMark   A->ops->setvalues                 = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */
7348c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
7358c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
7368c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
737a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
738f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
739a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
740a3f881fbSStefano Zampini   //A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
7418c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
7428c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
7438c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
7448c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
7458c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
7468c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
747152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
7488c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
7498c3ff71bSJunchao Zhang }
7508c3ff71bSJunchao Zhang 
7518c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
752152b3e56SJunchao Zhang /*@C
7538c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
7548c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
7558c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
7568c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
7578c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
7588c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
7598c3ff71bSJunchao Zhang 
7608c3ff71bSJunchao Zhang    Collective
7618c3ff71bSJunchao Zhang 
7628c3ff71bSJunchao Zhang    Input Parameters:
7638c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
7648c3ff71bSJunchao Zhang .  m - number of rows
7658c3ff71bSJunchao Zhang .  n - number of columns
7668c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
7678c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
7688c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
7698c3ff71bSJunchao Zhang 
7708c3ff71bSJunchao Zhang    Output Parameter:
7718c3ff71bSJunchao Zhang .  A - the matrix
7728c3ff71bSJunchao Zhang 
7738c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
7748c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
7758c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
7768c3ff71bSJunchao Zhang 
7778c3ff71bSJunchao Zhang    Notes:
7788c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
7798c3ff71bSJunchao Zhang 
7808c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
7818c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
7828c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
7838c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
7848c3ff71bSJunchao Zhang 
7858c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
7868c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
7878c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
7888c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
7898c3ff71bSJunchao Zhang 
7908c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
7918c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
7928c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
7938c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
7948c3ff71bSJunchao Zhang 
7958c3ff71bSJunchao Zhang    Level: intermediate
7968c3ff71bSJunchao Zhang 
7978c3ff71bSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
7988c3ff71bSJunchao Zhang @*/
7998c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
8008c3ff71bSJunchao Zhang {
8018c3ff71bSJunchao Zhang   PetscErrorCode ierr;
8028c3ff71bSJunchao Zhang 
8038c3ff71bSJunchao Zhang   PetscFunctionBegin;
8048c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
8058c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
8068c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
8078c3ff71bSJunchao Zhang   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
8088c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
8098c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
8108c3ff71bSJunchao Zhang }
811930e68a5SMark Adams 
812930e68a5SMark Adams // factorizations
8138f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
8148f7e8f9dSMark Adams //
8158f7e8f9dSMark Adams // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
8168f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
8178f7e8f9dSMark Adams //
8188f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
819930e68a5SMark Adams {
8208f7e8f9dSMark Adams   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
8218f7e8f9dSMark Adams   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
8228f7e8f9dSMark Adams   IS                 isrow = b->row,isicol = b->icol;
823930e68a5SMark Adams   PetscErrorCode     ierr;
8248f7e8f9dSMark Adams   const PetscInt     *r_h,*ic_h;
8258f7e8f9dSMark Adams   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_d.data(), *aj_d=aijkok->j_d.data(), *bi_d=baijkok->i_d.data(), *bj_d=baijkok->j_d.data(), *bdiag_d = baijkok->diag_d->data();
8268f7e8f9dSMark Adams   const PetscScalar  *aa_d = aijkok->a_d.data();
8278f7e8f9dSMark Adams   PetscScalar        *ba_d = baijkok->a_d.data();
8288f7e8f9dSMark Adams   PetscBool          row_identity,col_identity;
8298f7e8f9dSMark Adams   PetscInt           nc, Nf, nVec=32; // should be a parameter
8308f7e8f9dSMark Adams   PetscContainer     container;
831930e68a5SMark Adams 
832930e68a5SMark Adams   PetscFunctionBegin;
8338f7e8f9dSMark Adams   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n);
8348f7e8f9dSMark Adams   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
8358f7e8f9dSMark Adams   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
8368f7e8f9dSMark Adams   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
8378f7e8f9dSMark Adams   if (container) {
8388f7e8f9dSMark Adams     PetscInt *pNf=NULL, nv;
8398f7e8f9dSMark Adams     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
8408f7e8f9dSMark Adams     Nf = (*pNf)%1000;
8418f7e8f9dSMark Adams     nv = (*pNf)/1000;
8428f7e8f9dSMark Adams     if (nv>0) nVec = nv;
8438f7e8f9dSMark Adams   } else Nf = 1;
8448f7e8f9dSMark Adams   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
8458f7e8f9dSMark Adams   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
8468f7e8f9dSMark Adams   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
8478f7e8f9dSMark Adams   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
8488f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
8498f7e8f9dSMark Adams   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
8508f7e8f9dSMark Adams #endif
8518f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
8528f7e8f9dSMark Adams   {
8538f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
8548f7e8f9dSMark Adams     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
8558f7e8f9dSMark Adams     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
8568f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
8578f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
8588f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
8598f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
8608f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
8618f7e8f9dSMark Adams     size_t flops_h = 0.0;
8628f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
8638f7e8f9dSMark Adams     Kokkos::View<size_t> d_flops_k ("flops");
8648f7e8f9dSMark Adams     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
8658f7e8f9dSMark Adams     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
8668f7e8f9dSMark Adams     Kokkos::deep_copy (d_flops_k, h_flops_k);
8678f7e8f9dSMark Adams     Kokkos::deep_copy (d_r_k, h_r_k);
8688f7e8f9dSMark Adams     Kokkos::deep_copy (d_ic_k, h_ic_k);
8698f7e8f9dSMark Adams     // Fill A --> fact
8708f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
8718f7e8f9dSMark Adams         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in Cuda
8728f7e8f9dSMark Adams         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
8738f7e8f9dSMark Adams         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
8748f7e8f9dSMark Adams         // zero rows of B
8758f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
8768f7e8f9dSMark Adams             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
8778f7e8f9dSMark Adams             PetscScalar *baL = ba_d + bi_d[rowb];
8788f7e8f9dSMark Adams             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
8798f7e8f9dSMark Adams             /* zero (unfactored row) */
8808f7e8f9dSMark Adams             for (int j=0;j<nzbL;j++) baL[j] = 0;
8818f7e8f9dSMark Adams             for (int j=0;j<nzbU;j++) baU[j] = 0;
8828f7e8f9dSMark Adams           });
8838f7e8f9dSMark Adams         // copy A into B
8848f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
8858f7e8f9dSMark Adams             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
8868f7e8f9dSMark Adams             const PetscScalar *av    = aa_d + ai_d[rowa];
8878f7e8f9dSMark Adams             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
8888f7e8f9dSMark Adams             /* load in initial (unfactored row) */
8898f7e8f9dSMark Adams             for (int j=0;j<nza;j++) {
8908f7e8f9dSMark Adams               PetscInt    colb = ic[ajtmp[j]];
8918f7e8f9dSMark Adams               PetscScalar vala = av[j];
8928f7e8f9dSMark Adams               if (colb == rowb) {
8938f7e8f9dSMark Adams                 *(ba_d + bdiag_d[rowb]) = vala;
8948f7e8f9dSMark Adams               } else {
8958f7e8f9dSMark Adams                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
8968f7e8f9dSMark Adams                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
8978f7e8f9dSMark Adams                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
8988f7e8f9dSMark Adams                 for (int j=0; j<nz ; j++) {
8998f7e8f9dSMark Adams                   if (pbj[j] == colb) {
9008f7e8f9dSMark Adams                     pba[j] = vala;
9018f7e8f9dSMark Adams                     set++;
9028f7e8f9dSMark Adams                     break;
9038f7e8f9dSMark Adams                   }
9048f7e8f9dSMark Adams                 }
9058f7e8f9dSMark Adams                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
9068f7e8f9dSMark Adams               }
9078f7e8f9dSMark Adams             }
9088f7e8f9dSMark Adams           });
9098f7e8f9dSMark Adams       });
9108f7e8f9dSMark Adams     Kokkos::fence();
911930e68a5SMark Adams 
9128f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
9138f7e8f9dSMark Adams         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
9148f7e8f9dSMark Adams         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
9158f7e8f9dSMark Adams         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
9168f7e8f9dSMark Adams         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in Cuda
9178f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc;
9188f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
9198f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
9208f7e8f9dSMark Adams         for (int ii=start; ii<end-1; ii++) {
9218f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
9228f7e8f9dSMark Adams           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
9238f7e8f9dSMark Adams           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
9248f7e8f9dSMark Adams           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
9258f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
9268f7e8f9dSMark Adams               PetscInt kIdx = j*Ni + field_block_idx;
9278f7e8f9dSMark Adams               if (kIdx >= nzUi) /* void */ ;
9288f7e8f9dSMark Adams               else {
9298f7e8f9dSMark Adams                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
9308f7e8f9dSMark Adams                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
9318f7e8f9dSMark Adams                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
9328f7e8f9dSMark Adams                 size_t         st_idx;
9338f7e8f9dSMark Adams                 // find and do L(k,i) = A(:k,i) / A(i,i)
9348f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
9358f7e8f9dSMark Adams                 // get column, there has got to be a better way
9368f7e8f9dSMark Adams                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
9378f7e8f9dSMark Adams                     //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii);
9388f7e8f9dSMark Adams                     if (pjL[j] == ii) {
9398f7e8f9dSMark Adams                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
9408f7e8f9dSMark Adams                       idx = j; // output
9418f7e8f9dSMark Adams                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
9428f7e8f9dSMark Adams                     }
9438f7e8f9dSMark Adams                   }, st_idx);
9448f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
9458f7e8f9dSMark Adams                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii);
9468f7e8f9dSMark Adams                 else { // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
9478f7e8f9dSMark Adams                   // U(i+1,:end)
9488f7e8f9dSMark Adams                   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
9498f7e8f9dSMark Adams                       PetscScalar Uij = baUi[uiIdx];
9508f7e8f9dSMark Adams                       PetscInt    col = bjUi[uiIdx];
9518f7e8f9dSMark Adams                       if (col==myk) {
9528f7e8f9dSMark Adams                         // A_kk = A_kk - L_ki * U_ij(k)
9538f7e8f9dSMark Adams                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
9548f7e8f9dSMark Adams                         *Akkv = *Akkv - L_ki() * Uij; // UiK
9558f7e8f9dSMark Adams                       } else {
9568f7e8f9dSMark Adams                         PetscScalar    *start, *end, *pAkjv=NULL;
9578f7e8f9dSMark Adams                         PetscInt       high, low;
9588f7e8f9dSMark Adams                         const PetscInt *startj;
9598f7e8f9dSMark Adams                         if (col<myk) { // L
9608f7e8f9dSMark Adams                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
9618f7e8f9dSMark Adams                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
9628f7e8f9dSMark Adams                           start = pLki+1; // start at pLki+1, A22(myk,1)
9638f7e8f9dSMark Adams                           startj= bj_d + bi_d[myk] + idx;
9648f7e8f9dSMark Adams                           end   = ba_d + bi_d[myk+1];
9658f7e8f9dSMark Adams                         } else {
9668f7e8f9dSMark Adams                           PetscInt idx = bdiag_d[myk+1]+1;
9678f7e8f9dSMark Adams                           start = ba_d + idx;
9688f7e8f9dSMark Adams                           startj= bj_d + idx;
9698f7e8f9dSMark Adams                           end   = ba_d + bdiag_d[myk];
9708f7e8f9dSMark Adams                         }
9718f7e8f9dSMark Adams                         // search for 'col', use bisection search - TODO
9728f7e8f9dSMark Adams                         low  = 0;
9738f7e8f9dSMark Adams                         high = (PetscInt)(end-start);
9748f7e8f9dSMark Adams                         while (high-low > 5) {
9758f7e8f9dSMark Adams                           int t = (low+high)/2;
9768f7e8f9dSMark Adams                           if (startj[t] > col) high = t;
9778f7e8f9dSMark Adams                           else                 low  = t;
9788f7e8f9dSMark Adams                         }
9798f7e8f9dSMark Adams                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
9808f7e8f9dSMark Adams                           if (startj[pAkjv-start] == col) break;
9818f7e8f9dSMark Adams                         }
9828f7e8f9dSMark Adams                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col);
9838f7e8f9dSMark Adams                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
9848f7e8f9dSMark Adams                       }
9858f7e8f9dSMark Adams                     });
9868f7e8f9dSMark Adams                 }
9878f7e8f9dSMark Adams               }
9888f7e8f9dSMark Adams             });
9898f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
9908f7e8f9dSMark Adams           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
9918f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
9928f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
9938f7e8f9dSMark Adams       });
9948f7e8f9dSMark Adams     Kokkos::fence();
9958f7e8f9dSMark Adams     Kokkos::deep_copy (h_flops_k, d_flops_k);
9968f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
9978f7e8f9dSMark Adams     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
9988f7e8f9dSMark Adams #elif  defined(PETSC_USE_LOG)
9998f7e8f9dSMark Adams     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
10008f7e8f9dSMark Adams #endif
10018f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
10028f7e8f9dSMark Adams         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
10038f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
10048f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
10058f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
10068f7e8f9dSMark Adams             int i = start + outer_index*Ni + lg_rank%Ni;
10078f7e8f9dSMark Adams             if (i < end) {
10088f7e8f9dSMark Adams               PetscScalar *pv = ba_d + bdiag_d[i];
10098f7e8f9dSMark Adams               *pv = 1.0/(*pv);
10108f7e8f9dSMark Adams             }
10118f7e8f9dSMark Adams           });
10128f7e8f9dSMark Adams       });
10138f7e8f9dSMark Adams   }
10148f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
10158f7e8f9dSMark Adams   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
10168f7e8f9dSMark Adams #endif
10178f7e8f9dSMark Adams   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
10188f7e8f9dSMark Adams   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
10198f7e8f9dSMark Adams 
10208f7e8f9dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
10218f7e8f9dSMark Adams   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
10228f7e8f9dSMark Adams   if (b->inode.size) {
10238f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
10248f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
10258f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
10268f7e8f9dSMark Adams   } else {
10278f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
10288f7e8f9dSMark Adams   }
10298f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
10308f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
10318f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
10328f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
10338f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
10348f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
10358f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
10368f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
10378f7e8f9dSMark Adams 
1038930e68a5SMark Adams   PetscFunctionReturn(0);
1039930e68a5SMark Adams }
1040930e68a5SMark Adams 
1041930e68a5SMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOS(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1042930e68a5SMark Adams {
1043930e68a5SMark Adams   PetscErrorCode   ierr;
1044930e68a5SMark Adams 
1045930e68a5SMark Adams   PetscFunctionBegin;
1046930e68a5SMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1047930e68a5SMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOS;
1048930e68a5SMark Adams   PetscFunctionReturn(0);
1049930e68a5SMark Adams }
1050930e68a5SMark Adams 
10518f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
10528f7e8f9dSMark Adams {
10538f7e8f9dSMark Adams   PetscErrorCode   ierr;
10548f7e8f9dSMark Adams   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
10558f7e8f9dSMark Adams   const PetscInt   nrows   = A->rmap->n;
1056930e68a5SMark Adams 
10578f7e8f9dSMark Adams   PetscFunctionBegin;
10588f7e8f9dSMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
10598f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
10608f7e8f9dSMark Adams   // move B data into Kokkos
10618f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
10628f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
10638f7e8f9dSMark Adams   {
10648f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
10658f7e8f9dSMark Adams     if (!baijkok->diag_d) {
10668f7e8f9dSMark Adams       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1067152b3e56SJunchao Zhang       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
10688f7e8f9dSMark Adams       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
10698f7e8f9dSMark Adams     }
10708f7e8f9dSMark Adams   }
10718f7e8f9dSMark Adams   PetscFunctionReturn(0);
10728f7e8f9dSMark Adams }
10738f7e8f9dSMark Adams 
10748f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat,MatFactorType,Mat*);
10758f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat,MatFactorType,Mat*);
1076930e68a5SMark Adams 
1077930e68a5SMark Adams PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1078930e68a5SMark Adams {
1079930e68a5SMark Adams   PetscErrorCode ierr;
1080930e68a5SMark Adams 
1081930e68a5SMark Adams   PetscFunctionBegin;
1082930e68a5SMark Adams   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos);CHKERRQ(ierr);
10838f7e8f9dSMark Adams   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1084930e68a5SMark Adams   PetscFunctionReturn(0);
1085930e68a5SMark Adams }
1086930e68a5SMark Adams 
1087930e68a5SMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos(Mat A,MatSolverType *type)
1088930e68a5SMark Adams {
1089930e68a5SMark Adams   PetscFunctionBegin;
1090930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
1091930e68a5SMark Adams   PetscFunctionReturn(0);
1092930e68a5SMark Adams }
1093930e68a5SMark Adams 
10948f7e8f9dSMark Adams // use -pc_factor_mat_solver_type kokkosdevice
10958f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
10968f7e8f9dSMark Adams {
10978f7e8f9dSMark Adams   PetscFunctionBegin;
10988f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
10998f7e8f9dSMark Adams   PetscFunctionReturn(0);
11008f7e8f9dSMark Adams }
11018f7e8f9dSMark Adams 
1102930e68a5SMark Adams /*MC
1103930e68a5SMark Adams   MATSOLVERKOKKOS = "kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1104930e68a5SMark Adams   on a single GPU of type, seqaijkokkos, aijkokkos.
1105930e68a5SMark Adams 
1106930e68a5SMark Adams   Level: beginner
1107930e68a5SMark Adams 
1108930e68a5SMark Adams .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKOKKOS(), MATAIJKOKKOS, MatCreateAIJKOKKOS(), MatKOKKOSSetFormat(), MatKOKKOSStorageFormat, MatKOKKOSFormatOperation
1109930e68a5SMark Adams M*/
1110930e68a5SMark Adams 
1111930e68a5SMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat A,MatFactorType ftype,Mat *B)
1112930e68a5SMark Adams {
1113930e68a5SMark Adams   PetscErrorCode ierr;
1114930e68a5SMark Adams   PetscInt       n = A->rmap->n;
1115930e68a5SMark Adams 
1116930e68a5SMark Adams   PetscFunctionBegin;
1117930e68a5SMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1118930e68a5SMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1119930e68a5SMark Adams   (*B)->factortype = ftype;
1120*f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
1121930e68a5SMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1122930e68a5SMark Adams 
11238f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
1124930e68a5SMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1125930e68a5SMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOS;
1126930e68a5SMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1127930e68a5SMark Adams 
1128930e68a5SMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1129930e68a5SMark Adams   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos);CHKERRQ(ierr);
1130930e68a5SMark Adams   PetscFunctionReturn(0);
1131930e68a5SMark Adams }
11328f7e8f9dSMark Adams 
11338f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
11348f7e8f9dSMark Adams {
11358f7e8f9dSMark Adams   PetscErrorCode ierr;
11368f7e8f9dSMark Adams   PetscInt       n = A->rmap->n;
11378f7e8f9dSMark Adams 
11388f7e8f9dSMark Adams   PetscFunctionBegin;
11398f7e8f9dSMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
11408f7e8f9dSMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
11418f7e8f9dSMark Adams   (*B)->factortype = ftype;
1142*f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
11438f7e8f9dSMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
11448f7e8f9dSMark Adams 
11458f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
11468f7e8f9dSMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
11478f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
11488f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
11498f7e8f9dSMark Adams 
11508f7e8f9dSMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
11518f7e8f9dSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
11528f7e8f9dSMark Adams   PetscFunctionReturn(0);
11538f7e8f9dSMark Adams }
1154