xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 042217e806e4e3ebc5975c28dbf27a273923695f)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
38c3ff71bSJunchao Zhang #include <petscsystypes.h>
48c3ff71bSJunchao Zhang #include <petscerror.h>
58c3ff71bSJunchao Zhang 
68c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
7f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
88c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
98c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1086a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1186a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
1286a27549SJunchao Zhang 
138c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h>
148c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
158c3ff71bSJunchao Zhang 
168c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
178c3ff71bSJunchao Zhang 
188c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
198c3ff71bSJunchao Zhang {
208c3ff71bSJunchao Zhang   PetscErrorCode    ierr;
21a587d139SMark   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
228c3ff71bSJunchao Zhang 
238c3ff71bSJunchao Zhang   PetscFunctionBegin;
248c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
2586a27549SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
26a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
27a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
28a587d139SMark   }
298c3ff71bSJunchao Zhang   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
308c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
318c3ff71bSJunchao Zhang }
328c3ff71bSJunchao Zhang 
3386a27549SJunchao Zhang /* Sync CSR data to device if not yet */
348c3ff71bSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
358c3ff71bSJunchao Zhang {
36152b3e56SJunchao Zhang   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ*>(A->data);
378c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
388c3ff71bSJunchao Zhang   PetscInt                  nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
398c3ff71bSJunchao Zhang 
408c3ff71bSJunchao Zhang   PetscFunctionBegin;
4186a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
42152b3e56SJunchao Zhang   /* If aijkok is not built yet OR the nonzero pattern on CPU has changed, build aijkok from the scratch */
438c3ff71bSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
448c3ff71bSJunchao Zhang     delete aijkok;
458c3ff71bSJunchao Zhang     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
468c3ff71bSJunchao Zhang     aijkok->nonzerostate = A->nonzerostate;
478c3ff71bSJunchao Zhang     A->spptr             = aijkok;
488c3ff71bSJunchao Zhang   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
4986a27549SJunchao Zhang     aijkok->a_dual.clear_sync_state();
50152b3e56SJunchao Zhang     aijkok->a_dual.modify_host(); /* Mark host is modified */
51152b3e56SJunchao Zhang     aijkok->a_dual.sync_device(); /* Sync the device */
5286a27549SJunchao Zhang     aijkok->transpose_updated = PETSC_FALSE;
5386a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
548c3ff71bSJunchao Zhang   }
558c3ff71bSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
568c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
578c3ff71bSJunchao Zhang }
588c3ff71bSJunchao Zhang 
5986a27549SJunchao Zhang /* Mark the CSR data on device is modified */
6086a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSetDeviceModified(Mat A)
6186a27549SJunchao Zhang {
6286a27549SJunchao Zhang   PetscErrorCode   ierr;
6386a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
6486a27549SJunchao Zhang 
6586a27549SJunchao Zhang   PetscFunctionBegin;
6686a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
6786a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
6886a27549SJunchao Zhang   aijkok->a_dual.modify_device();
6986a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
7086a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
7186a27549SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_GPU;
7286a27549SJunchao Zhang   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
7386a27549SJunchao Zhang   PetscFunctionReturn(0);
7486a27549SJunchao Zhang }
7586a27549SJunchao Zhang 
76f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
77f0cf5187SStefano Zampini {
78f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
79f0cf5187SStefano Zampini 
80f0cf5187SStefano Zampini   PetscFunctionBegin;
81f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
8286a27549SJunchao Zhang    /* We do not expect one needs factors on host  */
8386a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
84f0cf5187SStefano Zampini   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
85f0cf5187SStefano Zampini     if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
8686a27549SJunchao Zhang     aijkok->a_dual.clear_sync_state();
8786a27549SJunchao Zhang     aijkok->a_dual.modify_device(); /* Mark device is modified */
8886a27549SJunchao Zhang     aijkok->a_dual.sync_host(); /* Sync the host */
89f0cf5187SStefano Zampini     A->offloadmask = PETSC_OFFLOAD_BOTH;
90f0cf5187SStefano Zampini   }
91f0cf5187SStefano Zampini   PetscFunctionReturn(0);
92f0cf5187SStefano Zampini }
93f0cf5187SStefano Zampini 
94f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
95f0cf5187SStefano Zampini {
96f0cf5187SStefano Zampini   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
97f0cf5187SStefano Zampini   PetscErrorCode ierr;
98f0cf5187SStefano Zampini 
99f0cf5187SStefano Zampini   PetscFunctionBegin;
100f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
101f0cf5187SStefano Zampini   *array = a->a;
102f0cf5187SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
103f0cf5187SStefano Zampini   PetscFunctionReturn(0);
104f0cf5187SStefano Zampini }
105f0cf5187SStefano Zampini 
106a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
107*042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
108a587d139SMark {
109*042217e8SBarry Smith   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
110*042217e8SBarry Smith   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
111a587d139SMark 
112a587d139SMark   PetscFunctionBegin;
113a587d139SMark   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos");
114152b3e56SJunchao Zhang   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
115a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
116a587d139SMark   PetscFunctionReturn(0);
117a587d139SMark }
118a587d139SMark 
119a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
120*042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
121a587d139SMark {
122*042217e8SBarry Smith   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
123a587d139SMark 
124a587d139SMark   PetscFunctionBegin;
125a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
126a587d139SMark     *d_mat = aijkok->device_mat_d.data();
127a587d139SMark   } else {
128a587d139SMark     PetscErrorCode   ierr;
129a587d139SMark     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
130a587d139SMark     *d_mat  = NULL;
131a587d139SMark   }
132a587d139SMark   PetscFunctionReturn(0);
133a587d139SMark }
134152b3e56SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose(Mat A)
135152b3e56SJunchao Zhang {
136152b3e56SJunchao Zhang   PetscErrorCode                   ierr;
137152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
138152b3e56SJunchao Zhang 
139152b3e56SJunchao Zhang   PetscFunctionBegin;
140152b3e56SJunchao Zhang   if (!aijkok->At) { /* Generate At for the first time */
141152b3e56SJunchao Zhang     ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->At);CHKERRQ(ierr);
142152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr);
14386a27549SJunchao Zhang   } else if (!aijkok->transpose_updated) { /* Only update At values */
144152b3e56SJunchao Zhang     ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->At);CHKERRQ(ierr);
145152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr);
146152b3e56SJunchao Zhang   }
14786a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_TRUE;
148152b3e56SJunchao Zhang   PetscFunctionReturn(0);
149152b3e56SJunchao Zhang }
150152b3e56SJunchao Zhang 
15186a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian(Mat A)
152152b3e56SJunchao Zhang {
153152b3e56SJunchao Zhang   PetscErrorCode                   ierr;
154152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
155152b3e56SJunchao Zhang 
156152b3e56SJunchao Zhang   PetscFunctionBegin;
157152b3e56SJunchao Zhang   if (!aijkok->Ah) { /* Generate Ah for the first time */
158152b3e56SJunchao Zhang     ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->Ah);CHKERRQ(ierr);
159152b3e56SJunchao Zhang     ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr);
160152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr);
16186a27549SJunchao Zhang   } else if (!aijkok->hermitian_updated) { /* Only update Ah values */
162152b3e56SJunchao Zhang     ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->Ah);CHKERRQ(ierr);
163152b3e56SJunchao Zhang     ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr);
164152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr);
165152b3e56SJunchao Zhang   }
16686a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_TRUE;
167152b3e56SJunchao Zhang   PetscFunctionReturn(0);
168152b3e56SJunchao Zhang }
169a587d139SMark 
1708c3ff71bSJunchao Zhang /* y = A x */
1718c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1728c3ff71bSJunchao Zhang {
1738c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1748c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
175152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
176152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
1778c3ff71bSJunchao Zhang 
1788c3ff71bSJunchao Zhang   PetscFunctionBegin;
1798c3ff71bSJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
180152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
181152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
1828c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
183152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
184152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
185152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
186bb2d6e60SJunchao Zhang   ierr   = WaitForKokkos();CHKERRQ(ierr);
187152b3e56SJunchao Zhang   /* 2.0*aijkok->csrmat.nnz()-aijkok->csrmat.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */
188152b3e56SJunchao Zhang   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
1898c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1908c3ff71bSJunchao Zhang }
1918c3ff71bSJunchao Zhang 
1928c3ff71bSJunchao Zhang /* y = A^T x */
1938c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1948c3ff71bSJunchao Zhang {
1958c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1968c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
197152b3e56SJunchao Zhang   Mat                              B;
198152b3e56SJunchao Zhang   const char                       *mode;
199152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
200152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2018c3ff71bSJunchao Zhang 
2028c3ff71bSJunchao Zhang   PetscFunctionBegin;
2038c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
204152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
205152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
206152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
207152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr);
208152b3e56SJunchao Zhang     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At;
209152b3e56SJunchao Zhang     mode = "N";
210152b3e56SJunchao Zhang   } else {
211152b3e56SJunchao Zhang     B    = A;
212152b3e56SJunchao Zhang     mode = "T";
213152b3e56SJunchao Zhang   }
214152b3e56SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
215152b3e56SJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
216152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
217152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
218bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
219152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2208c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2218c3ff71bSJunchao Zhang }
2228c3ff71bSJunchao Zhang 
2238c3ff71bSJunchao Zhang /* y = A^H x */
2248c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2258c3ff71bSJunchao Zhang {
2268c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2278c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
228152b3e56SJunchao Zhang   Mat                              B;
229152b3e56SJunchao Zhang   const char                       *mode;
230152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
231152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2328c3ff71bSJunchao Zhang 
2338c3ff71bSJunchao Zhang   PetscFunctionBegin;
2348c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
235152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
236152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
237152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
23886a27549SJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr);
239152b3e56SJunchao Zhang     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah;
240152b3e56SJunchao Zhang     mode = "N";
241152b3e56SJunchao Zhang   } else {
242152b3e56SJunchao Zhang     B    = A;
243152b3e56SJunchao Zhang     mode = "C";
244152b3e56SJunchao Zhang   }
245152b3e56SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
246152b3e56SJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
247152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
248152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
249bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
250152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2518c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2528c3ff71bSJunchao Zhang }
2538c3ff71bSJunchao Zhang 
2548c3ff71bSJunchao Zhang /* z = A x + y */
2558c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
2568c3ff71bSJunchao Zhang {
2578c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2588c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
259152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
260152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
2618c3ff71bSJunchao Zhang 
2628c3ff71bSJunchao Zhang   PetscFunctionBegin;
2638c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
264152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
265152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
266152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
2678c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
2688c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
269152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
270152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
271152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
272152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
273bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
274152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2758c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2768c3ff71bSJunchao Zhang }
2778c3ff71bSJunchao Zhang 
2788c3ff71bSJunchao Zhang /* z = A^T x + y */
2798c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
2808c3ff71bSJunchao Zhang {
2818c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2828c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
283152b3e56SJunchao Zhang   Mat                              B;
284152b3e56SJunchao Zhang   const char                       *mode;
285152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
286152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
2878c3ff71bSJunchao Zhang 
2888c3ff71bSJunchao Zhang   PetscFunctionBegin;
2898c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
290152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
291152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
292152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
2938c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
294152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
295152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr);
296152b3e56SJunchao Zhang     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At;
297152b3e56SJunchao Zhang     mode = "N";
298152b3e56SJunchao Zhang   } else {
299152b3e56SJunchao Zhang     B    = A;
300152b3e56SJunchao Zhang     mode = "T";
301152b3e56SJunchao Zhang   }
302152b3e56SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
303152b3e56SJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
304152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
305152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
306152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
307bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
308152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
3098c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3108c3ff71bSJunchao Zhang }
3118c3ff71bSJunchao Zhang 
3128c3ff71bSJunchao Zhang /* z = A^H x + y */
3138c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3148c3ff71bSJunchao Zhang {
3158c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3168c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
317152b3e56SJunchao Zhang   Mat                              B;
318152b3e56SJunchao Zhang   const char                       *mode;
319152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
320152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
3218c3ff71bSJunchao Zhang 
3228c3ff71bSJunchao Zhang   PetscFunctionBegin;
3238c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
324152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
325152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
326152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3278c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
328152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
32986a27549SJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr);
330152b3e56SJunchao Zhang     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah;
331152b3e56SJunchao Zhang     mode = "N";
332152b3e56SJunchao Zhang   } else {
333152b3e56SJunchao Zhang     B    = A;
334152b3e56SJunchao Zhang     mode = "C";
335152b3e56SJunchao Zhang   }
336152b3e56SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
337152b3e56SJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
338152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
339152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
340152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
341bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
342152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
343152b3e56SJunchao Zhang   PetscFunctionReturn(0);
344152b3e56SJunchao Zhang }
345152b3e56SJunchao Zhang 
346152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
347152b3e56SJunchao Zhang {
348152b3e56SJunchao Zhang   PetscErrorCode            ierr;
349152b3e56SJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
350152b3e56SJunchao Zhang 
351152b3e56SJunchao Zhang   PetscFunctionBegin;
352152b3e56SJunchao Zhang   switch (op) {
353152b3e56SJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
354152b3e56SJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
355152b3e56SJunchao Zhang       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
356152b3e56SJunchao Zhang       A->form_explicit_transpose = flg;
357152b3e56SJunchao Zhang       break;
358152b3e56SJunchao Zhang     default:
359152b3e56SJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
360152b3e56SJunchao Zhang       break;
361152b3e56SJunchao Zhang   }
3628c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3638c3ff71bSJunchao Zhang }
3648c3ff71bSJunchao Zhang 
3653d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
3668c3ff71bSJunchao Zhang {
3678c3ff71bSJunchao Zhang   PetscErrorCode ierr;
3688c3ff71bSJunchao Zhang   Mat            B;
369c58ef05eSStefano Zampini   Mat_SeqAIJ     *aij;
3708c3ff71bSJunchao Zhang 
3718c3ff71bSJunchao Zhang   PetscFunctionBegin;
372a3f881fbSStefano Zampini   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
3738c3ff71bSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
3748c3ff71bSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
3758c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
3768c3ff71bSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
3778c3ff71bSJunchao Zhang   }
3788c3ff71bSJunchao Zhang 
3798c3ff71bSJunchao Zhang   B    = *newmat;
3808c3ff71bSJunchao Zhang   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
38186a27549SJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
38286a27549SJunchao Zhang 
3838c3ff71bSJunchao Zhang   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
3840d8bce8aSStefano Zampini   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
385f0cf5187SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr);
386c58ef05eSStefano Zampini   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
387c58ef05eSStefano Zampini   aij  = (Mat_SeqAIJ*)B->data;
388c58ef05eSStefano Zampini   aij->inode.use = PETSC_FALSE;
3898c3ff71bSJunchao Zhang 
3908c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3918c3ff71bSJunchao Zhang }
3928c3ff71bSJunchao Zhang 
3938c3ff71bSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
3948c3ff71bSJunchao Zhang {
3958c3ff71bSJunchao Zhang   PetscErrorCode ierr;
3968c3ff71bSJunchao Zhang 
3978c3ff71bSJunchao Zhang   PetscFunctionBegin;
3988c3ff71bSJunchao Zhang   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
3998c3ff71bSJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
4008c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4018c3ff71bSJunchao Zhang }
4028c3ff71bSJunchao Zhang 
4038c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
4048c3ff71bSJunchao Zhang {
4058c3ff71bSJunchao Zhang   PetscErrorCode             ierr;
40686a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
4078c3ff71bSJunchao Zhang 
4088c3ff71bSJunchao Zhang   PetscFunctionBegin;
40986a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
41086a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
4118f7e8f9dSMark Adams     if (aijkok) {
4128f7e8f9dSMark Adams       if (aijkok->device_mat_d.data()) {
413a587d139SMark         delete aijkok->colmap_d;
414a587d139SMark         delete aijkok->i_uncompressed_d;
415a587d139SMark       }
4168f7e8f9dSMark Adams       if (aijkok->diag_d) delete aijkok->diag_d;
4178f7e8f9dSMark Adams     }
4188c3ff71bSJunchao Zhang     delete aijkok;
41986a27549SJunchao Zhang   } else {
42086a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
42186a27549SJunchao Zhang   }
422f0cf5187SStefano Zampini   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr);
423152b3e56SJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
424152b3e56SJunchao Zhang   A->spptr = NULL;
4258c3ff71bSJunchao Zhang   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
4268c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4278c3ff71bSJunchao Zhang }
4288c3ff71bSJunchao Zhang 
42986a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
43086a27549SJunchao Zhang {
43186a27549SJunchao Zhang   PetscErrorCode ierr;
43286a27549SJunchao Zhang 
43386a27549SJunchao Zhang   PetscFunctionBegin;
43486a27549SJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
43586a27549SJunchao Zhang   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
43686a27549SJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
43786a27549SJunchao Zhang   PetscFunctionReturn(0);
43886a27549SJunchao Zhang }
43986a27549SJunchao Zhang 
440a3f881fbSStefano Zampini #if 0
441a3f881fbSStefano Zampini static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data)
442a3f881fbSStefano Zampini {
443a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data);
444a3f881fbSStefano Zampini 
445a3f881fbSStefano Zampini   PetscFunctionBegin;
446a3f881fbSStefano Zampini   delete kh;
447a3f881fbSStefano Zampini   PetscFunctionReturn(0);
448a3f881fbSStefano Zampini }
449a3f881fbSStefano Zampini 
450a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
451a3f881fbSStefano Zampini {
452a3f881fbSStefano Zampini   Mat_Product          *product = C->product;
453a3f881fbSStefano Zampini   Mat                  A,B;
454a3f881fbSStefano Zampini   MatProductType       ptype;
455a3f881fbSStefano Zampini   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
456a3f881fbSStefano Zampini   bool                 tA,tB;
457a3f881fbSStefano Zampini   PetscErrorCode       ierr;
458a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh;
459a3f881fbSStefano Zampini   Mat_SeqAIJ           *c;
460a3f881fbSStefano Zampini 
461a3f881fbSStefano Zampini   PetscFunctionBegin;
462a3f881fbSStefano Zampini   MatCheckProduct(C,1);
463a3f881fbSStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
464a3f881fbSStefano Zampini   A = product->A;
465a3f881fbSStefano Zampini   B = product->B;
466a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
467a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
468a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
469a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
470a3f881fbSStefano Zampini   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
471a3f881fbSStefano Zampini   kh   = static_cast<MatMatKernelHandle_t*>(C->product->data);
472a3f881fbSStefano Zampini   ptype = product->type;
473a3f881fbSStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
474a3f881fbSStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
475a3f881fbSStefano Zampini   switch (ptype) {
476a3f881fbSStefano Zampini   case MATPRODUCT_AB:
477a3f881fbSStefano Zampini     tA = false;
478a3f881fbSStefano Zampini     tB = false;
479a3f881fbSStefano Zampini     break;
480a3f881fbSStefano Zampini   case MATPRODUCT_AtB:
481a3f881fbSStefano Zampini     tA = true;
482a3f881fbSStefano Zampini     tB = false;
483a3f881fbSStefano Zampini     break;
484a3f881fbSStefano Zampini   case MATPRODUCT_ABt:
485a3f881fbSStefano Zampini     tA = false;
486a3f881fbSStefano Zampini     tB = true;
487a3f881fbSStefano Zampini     break;
488a3f881fbSStefano Zampini   default:
489a3f881fbSStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
490a3f881fbSStefano Zampini   }
491a3f881fbSStefano Zampini 
492a3f881fbSStefano Zampini   KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr);
493a3f881fbSStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
494a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
495a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
496a3f881fbSStefano Zampini   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
497a3f881fbSStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
498a3f881fbSStefano Zampini   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
499a3f881fbSStefano Zampini   c->reallocs         = 0;
500a3f881fbSStefano Zampini   C->info.mallocs    += 0;
501a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
502a3f881fbSStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
503a3f881fbSStefano Zampini   C->num_ass++;
504a3f881fbSStefano Zampini   /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */
505a3f881fbSStefano Zampini   // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray()
506a3f881fbSStefano Zampini   C->offloadmask = PETSC_OFFLOAD_BOTH;
507a3f881fbSStefano Zampini   // Also, we should add support to copy back from device to host
508a3f881fbSStefano Zampini   PetscFunctionReturn(0);
509a3f881fbSStefano Zampini }
510a3f881fbSStefano Zampini 
511a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
512a3f881fbSStefano Zampini {
513a3f881fbSStefano Zampini   Mat_Product          *product = C->product;
514a3f881fbSStefano Zampini   Mat                  A,B;
515a3f881fbSStefano Zampini   MatProductType       ptype;
516a3f881fbSStefano Zampini   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
517a3f881fbSStefano Zampini   PetscInt             m,n,k;
518a3f881fbSStefano Zampini   bool                 tA,tB;
519a3f881fbSStefano Zampini   PetscErrorCode       ierr;
520a3f881fbSStefano Zampini   Mat_SeqAIJ           *c;
521a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh;
522a3f881fbSStefano Zampini 
523a3f881fbSStefano Zampini   PetscFunctionBegin;
524a3f881fbSStefano Zampini   MatCheckProduct(C,1);
525a3f881fbSStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
526a3f881fbSStefano Zampini   A = product->A;
527a3f881fbSStefano Zampini   B = product->B;
528a3f881fbSStefano Zampini   // TODO only copy the i,j data, not the values
529a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
530a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
531a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
532a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
533a3f881fbSStefano Zampini   ptype = product->type;
534a3f881fbSStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
535a3f881fbSStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
536a3f881fbSStefano Zampini   switch (ptype) {
537a3f881fbSStefano Zampini   case MATPRODUCT_AB:
538a3f881fbSStefano Zampini     tA = false;
539a3f881fbSStefano Zampini     tB = false;
540a3f881fbSStefano Zampini     m = A->rmap->n;
541a3f881fbSStefano Zampini     n = B->cmap->n;
542a3f881fbSStefano Zampini     break;
543a3f881fbSStefano Zampini   case MATPRODUCT_AtB:
544a3f881fbSStefano Zampini     tA = true;
545a3f881fbSStefano Zampini     tB = false;
546a3f881fbSStefano Zampini     m = A->cmap->n;
547a3f881fbSStefano Zampini     n = B->cmap->n;
548a3f881fbSStefano Zampini     break;
549a3f881fbSStefano Zampini   case MATPRODUCT_ABt:
550a3f881fbSStefano Zampini     tA = false;
551a3f881fbSStefano Zampini     tB = true;
552a3f881fbSStefano Zampini     m = A->rmap->n;
553a3f881fbSStefano Zampini     n = B->rmap->n;
554a3f881fbSStefano Zampini     break;
555a3f881fbSStefano Zampini   default:
556a3f881fbSStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
557a3f881fbSStefano Zampini   }
558a3f881fbSStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
559a3f881fbSStefano Zampini   ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr);
560a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
561a3f881fbSStefano Zampini 
562a3f881fbSStefano Zampini   kh = new MatMatKernelHandle_t;
563a3f881fbSStefano Zampini   // TODO SZ: ADD RUNTIME SELECTION OF THESE
564a3f881fbSStefano Zampini   kh->set_team_work_size(16);
565a3f881fbSStefano Zampini   kh->set_dynamic_scheduling(true);
566a3f881fbSStefano Zampini   // Select an spgemm algorithm, limited by configuration at compile-time and
567a3f881fbSStefano Zampini   // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED,
568a3f881fbSStefano Zampini   // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL}
569a3f881fbSStefano Zampini   std::string myalg("SPGEMM_KK_MEMORY");
570a3f881fbSStefano Zampini   kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg));
571a3f881fbSStefano Zampini 
572a3f881fbSStefano Zampini   // TODO JZ
573a3f881fbSStefano Zampini   ckok = NULL; //new Mat_SeqAIJKokkos();
574a3f881fbSStefano Zampini   C->spptr = ckok;
575a3f881fbSStefano Zampini   KokkosCsrMatrix_t ccsr; // here only to have the code compile
576a3f881fbSStefano Zampini   KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr);
577*042217e8SBarry Smith 
578a3f881fbSStefano Zampini   c->singlemalloc = PETSC_FALSE;
579a3f881fbSStefano Zampini   c->free_a       = PETSC_TRUE;
580a3f881fbSStefano Zampini   c->free_ij      = PETSC_TRUE;
581a3f881fbSStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
582a3f881fbSStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
583a3f881fbSStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
584*042217e8SBarry Smith 
585a3f881fbSStefano Zampini   // TODO JZ copy from device to c->i and c->j
586*042217e8SBarry Smith 
587a3f881fbSStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
588a3f881fbSStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
589a3f881fbSStefano Zampini   c->maxnz = c->nz;
590a3f881fbSStefano Zampini   c->nonzerorowcnt = 0;
591a3f881fbSStefano Zampini   c->rmax = 0;
592a3f881fbSStefano Zampini   for (k = 0; k < m; k++) {
593a3f881fbSStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
594a3f881fbSStefano Zampini     c->ilen[k] = c->imax[k] = nn;
595a3f881fbSStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
596a3f881fbSStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
597a3f881fbSStefano Zampini   }
598a3f881fbSStefano Zampini 
599a3f881fbSStefano Zampini   C->nonzerostate++;
600a3f881fbSStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
601a3f881fbSStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
602a3f881fbSStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
603a3f881fbSStefano Zampini   ckok->nonzerostate = C->nonzerostate;
604a3f881fbSStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
605a3f881fbSStefano Zampini   C->preallocated  = PETSC_TRUE;
606a3f881fbSStefano Zampini   C->assembled     = PETSC_FALSE;
607a3f881fbSStefano Zampini   C->was_assembled = PETSC_FALSE;
608a3f881fbSStefano Zampini 
609a3f881fbSStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
610a3f881fbSStefano Zampini   C->product->data = kh;
611a3f881fbSStefano Zampini   C->product->destroy = MatMatKernelHandleDestroy_Private;
612a3f881fbSStefano Zampini   PetscFunctionReturn(0);
613a3f881fbSStefano Zampini }
614a3f881fbSStefano Zampini 
615a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
616a3f881fbSStefano Zampini PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
617a3f881fbSStefano Zampini {
618a3f881fbSStefano Zampini   Mat_Product    *product = mat->product;
619a3f881fbSStefano Zampini   PetscErrorCode ierr;
620a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
621a3f881fbSStefano Zampini 
622a3f881fbSStefano Zampini   PetscFunctionBegin;
623a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
624a3f881fbSStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
625a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
626a3f881fbSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
627a3f881fbSStefano Zampini   }
628a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
629a3f881fbSStefano Zampini     switch (product->type) {
630a3f881fbSStefano Zampini     case MATPRODUCT_AB:
631a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
632a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
633a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
634a3f881fbSStefano Zampini       break;
635a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
636a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
637a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
638a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
639a3f881fbSStefano Zampini       break;
640a3f881fbSStefano Zampini     default:
641a3f881fbSStefano Zampini       break;
642a3f881fbSStefano Zampini     }
643a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
644a3f881fbSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
645a3f881fbSStefano Zampini   }
646a3f881fbSStefano Zampini   PetscFunctionReturn(0);
647a3f881fbSStefano Zampini }
648a3f881fbSStefano Zampini #endif
649a3f881fbSStefano Zampini 
650a587d139SMark static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
651a587d139SMark {
652a587d139SMark   PetscErrorCode    ierr;
653a587d139SMark   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
654a587d139SMark   PetscFunctionBegin;
65586a27549SJunchao Zhang   if (aijkok && aijkok->device_mat_d.data()) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported");
656a587d139SMark   ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr);
657a587d139SMark   PetscFunctionReturn(0);
658a587d139SMark }
659a587d139SMark 
660f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
661f0cf5187SStefano Zampini {
662f0cf5187SStefano Zampini   PetscErrorCode   ierr;
663f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
664f0cf5187SStefano Zampini 
665f0cf5187SStefano Zampini   PetscFunctionBegin;
666f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
667f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
668f0cf5187SStefano Zampini   KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d);
669f0cf5187SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
670f0cf5187SStefano Zampini   ierr = WaitForKokkos();CHKERRQ(ierr);
671f0cf5187SStefano Zampini   ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr);
672f0cf5187SStefano Zampini   // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
673f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
674f0cf5187SStefano Zampini   PetscFunctionReturn(0);
675f0cf5187SStefano Zampini }
676f0cf5187SStefano Zampini 
677a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
678a587d139SMark {
679a587d139SMark   PetscErrorCode   ierr;
680a587d139SMark   PetscBool        both = PETSC_FALSE;
681a587d139SMark   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
682a587d139SMark   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
683a587d139SMark 
684a587d139SMark   PetscFunctionBegin;
685a587d139SMark   if (aijkok && aijkok->a_d.data()) {
686f0cf5187SStefano Zampini     KokkosBlas::fill(aijkok->a_d,0.0);
687a587d139SMark     both = PETSC_TRUE;
688a587d139SMark   }
689a587d139SMark   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
690a587d139SMark   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
691a587d139SMark   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
692a587d139SMark   else A->offloadmask = PETSC_OFFLOAD_CPU;
693a587d139SMark   PetscFunctionReturn(0);
694a587d139SMark }
695a587d139SMark 
696db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
697db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
698db78de30SJunchao Zhang {
699db78de30SJunchao Zhang   PetscErrorCode     ierr;
700db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
701db78de30SJunchao Zhang 
702db78de30SJunchao Zhang   PetscFunctionBegin;
703db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
704db78de30SJunchao Zhang   PetscValidPointer(kv,2);
705db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
706db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
707db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
708db78de30SJunchao Zhang   *kv    = aijkok->a_d;
709db78de30SJunchao Zhang   PetscFunctionReturn(0);
710db78de30SJunchao Zhang }
711db78de30SJunchao Zhang 
712db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
713db78de30SJunchao Zhang {
714db78de30SJunchao Zhang   PetscFunctionBegin;
715db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
716db78de30SJunchao Zhang   PetscValidPointer(kv,2);
717db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
718db78de30SJunchao Zhang   PetscFunctionReturn(0);
719db78de30SJunchao Zhang }
720db78de30SJunchao Zhang 
721db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv)
722db78de30SJunchao Zhang {
723db78de30SJunchao Zhang   PetscErrorCode     ierr;
724db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
725db78de30SJunchao Zhang 
726db78de30SJunchao Zhang   PetscFunctionBegin;
727db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
728db78de30SJunchao Zhang   PetscValidPointer(kv,2);
729db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
730db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
731db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
732db78de30SJunchao Zhang   *kv    = aijkok->a_d;
733db78de30SJunchao Zhang   PetscFunctionReturn(0);
734db78de30SJunchao Zhang }
735db78de30SJunchao Zhang 
736db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv)
737db78de30SJunchao Zhang {
738db78de30SJunchao Zhang   PetscErrorCode     ierr;
739db78de30SJunchao Zhang 
740db78de30SJunchao Zhang   PetscFunctionBegin;
741db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
742db78de30SJunchao Zhang   PetscValidPointer(kv,2);
743db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
744db78de30SJunchao Zhang   ierr = MatSeqAIJKokkosSetDeviceModified(A);CHKERRQ(ierr);
745db78de30SJunchao Zhang   PetscFunctionReturn(0);
746db78de30SJunchao Zhang }
747db78de30SJunchao Zhang 
748db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
749db78de30SJunchao Zhang {
750db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
751db78de30SJunchao Zhang 
752db78de30SJunchao Zhang   PetscFunctionBegin;
753db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
754db78de30SJunchao Zhang   PetscValidPointer(kv,2);
755db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
756db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
757db78de30SJunchao Zhang   *kv    = aijkok->a_d;
758db78de30SJunchao Zhang   PetscFunctionReturn(0);
759db78de30SJunchao Zhang }
760db78de30SJunchao Zhang 
761db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
762db78de30SJunchao Zhang {
763db78de30SJunchao Zhang   PetscErrorCode     ierr;
764db78de30SJunchao Zhang 
765db78de30SJunchao Zhang   PetscFunctionBegin;
766db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
767db78de30SJunchao Zhang   PetscValidPointer(kv,2);
768db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
769db78de30SJunchao Zhang   ierr = MatSeqAIJKokkosSetDeviceModified(A);CHKERRQ(ierr);
770db78de30SJunchao Zhang   PetscFunctionReturn(0);
771db78de30SJunchao Zhang }
772db78de30SJunchao Zhang 
773db78de30SJunchao Zhang /* Computes Y += a*X */
774f0cf5187SStefano Zampini static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str)
775a587d139SMark {
776a587d139SMark   PetscErrorCode ierr;
777a587d139SMark   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
778a587d139SMark 
779a587d139SMark   PetscFunctionBegin;
780f0cf5187SStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
781a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
782a587d139SMark     PetscFunctionReturn(0);
783a587d139SMark   }
784db78de30SJunchao Zhang 
785f0cf5187SStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) {
786a587d139SMark     PetscBool e;
787a587d139SMark     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
788a587d139SMark     if (e) {
789a587d139SMark       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
790f0cf5187SStefano Zampini       if (e) str = SAME_NONZERO_PATTERN;
791a587d139SMark     }
792a587d139SMark   }
793db78de30SJunchao Zhang 
794a587d139SMark   if (str != SAME_NONZERO_PATTERN) {
795db78de30SJunchao Zhang     /* TODO: do MatAXPY on device */
796a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
797a587d139SMark     PetscFunctionReturn(0);
798a587d139SMark   } else {
799db78de30SJunchao Zhang     ConstPetscScalarKokkosView xv;
800db78de30SJunchao Zhang     PetscScalarKokkosView      yv;
801db78de30SJunchao Zhang 
802db78de30SJunchao Zhang     ierr = MatSeqAIJGetKokkosView(X,&xv);CHKERRQ(ierr);
803db78de30SJunchao Zhang     ierr = MatSeqAIJGetKokkosView(Y,&yv);CHKERRQ(ierr);
804db78de30SJunchao Zhang     KokkosBlas::axpy(a,xv,yv);
805db78de30SJunchao Zhang     ierr = MatSeqAIJRestoreKokkosView(X,&xv);CHKERRQ(ierr);
806db78de30SJunchao Zhang     ierr = MatSeqAIJRestoreKokkosView(Y,&yv);CHKERRQ(ierr);
807a587d139SMark   }
808a587d139SMark   PetscFunctionReturn(0);
809a587d139SMark }
810a587d139SMark 
81186a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
8128f7e8f9dSMark Adams {
8138f7e8f9dSMark Adams   PetscErrorCode ierr;
8148f7e8f9dSMark Adams 
8158f7e8f9dSMark Adams   PetscFunctionBegin;
8168f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
8178f7e8f9dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
8188f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
8198f7e8f9dSMark Adams   PetscFunctionReturn(0);
8208f7e8f9dSMark Adams }
8218f7e8f9dSMark Adams 
8228c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
8238c3ff71bSJunchao Zhang {
8248c3ff71bSJunchao Zhang   PetscFunctionBegin;
8256f3d89d0SStefano Zampini   A->boundtocpu = PETSC_FALSE;
8266f3d89d0SStefano Zampini 
827a587d139SMark   A->ops->setvalues                 = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */
8288c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
8298c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
8308c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
831a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
832f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
833a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
834a3f881fbSStefano Zampini   //A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
8358c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
8368c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
8378c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
8388c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
8398c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
8408c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
841152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
8428c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
8438c3ff71bSJunchao Zhang }
8448c3ff71bSJunchao Zhang 
8458c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
846152b3e56SJunchao Zhang /*@C
8478c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
8488c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
8498c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
8508c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
8518c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
8528c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
8538c3ff71bSJunchao Zhang 
8548c3ff71bSJunchao Zhang    Collective
8558c3ff71bSJunchao Zhang 
8568c3ff71bSJunchao Zhang    Input Parameters:
8578c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
8588c3ff71bSJunchao Zhang .  m - number of rows
8598c3ff71bSJunchao Zhang .  n - number of columns
8608c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
8618c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
8628c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
8638c3ff71bSJunchao Zhang 
8648c3ff71bSJunchao Zhang    Output Parameter:
8658c3ff71bSJunchao Zhang .  A - the matrix
8668c3ff71bSJunchao Zhang 
8678c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
8688c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
8698c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
8708c3ff71bSJunchao Zhang 
8718c3ff71bSJunchao Zhang    Notes:
8728c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
8738c3ff71bSJunchao Zhang 
8748c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
8758c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
8768c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
8778c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
8788c3ff71bSJunchao Zhang 
8798c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
8808c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
8818c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
8828c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
8838c3ff71bSJunchao Zhang 
8848c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
8858c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
8868c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
8878c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
8888c3ff71bSJunchao Zhang 
8898c3ff71bSJunchao Zhang    Level: intermediate
8908c3ff71bSJunchao Zhang 
8918c3ff71bSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
8928c3ff71bSJunchao Zhang @*/
8938c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
8948c3ff71bSJunchao Zhang {
8958c3ff71bSJunchao Zhang   PetscErrorCode ierr;
8968c3ff71bSJunchao Zhang 
8978c3ff71bSJunchao Zhang   PetscFunctionBegin;
8988c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
8998c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
9008c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
9018c3ff71bSJunchao Zhang   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
9028c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
9038c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
9048c3ff71bSJunchao Zhang }
905930e68a5SMark Adams 
9068f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
9078f7e8f9dSMark Adams //
9088f7e8f9dSMark Adams // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
9098f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
9108f7e8f9dSMark Adams //
9118f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
912930e68a5SMark Adams {
9138f7e8f9dSMark Adams   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
9148f7e8f9dSMark Adams   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
9158f7e8f9dSMark Adams   IS                 isrow = b->row,isicol = b->icol;
916930e68a5SMark Adams   PetscErrorCode     ierr;
9178f7e8f9dSMark Adams   const PetscInt     *r_h,*ic_h;
9188f7e8f9dSMark Adams   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_d.data(), *aj_d=aijkok->j_d.data(), *bi_d=baijkok->i_d.data(), *bj_d=baijkok->j_d.data(), *bdiag_d = baijkok->diag_d->data();
9198f7e8f9dSMark Adams   const PetscScalar  *aa_d = aijkok->a_d.data();
9208f7e8f9dSMark Adams   PetscScalar        *ba_d = baijkok->a_d.data();
9218f7e8f9dSMark Adams   PetscBool          row_identity,col_identity;
9228f7e8f9dSMark Adams   PetscInt           nc, Nf, nVec=32; // should be a parameter
9238f7e8f9dSMark Adams   PetscContainer     container;
924930e68a5SMark Adams 
925930e68a5SMark Adams   PetscFunctionBegin;
9268f7e8f9dSMark Adams   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n);
9278f7e8f9dSMark Adams   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
9288f7e8f9dSMark Adams   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
9298f7e8f9dSMark Adams   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
9308f7e8f9dSMark Adams   if (container) {
9318f7e8f9dSMark Adams     PetscInt *pNf=NULL, nv;
9328f7e8f9dSMark Adams     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
9338f7e8f9dSMark Adams     Nf = (*pNf)%1000;
9348f7e8f9dSMark Adams     nv = (*pNf)/1000;
9358f7e8f9dSMark Adams     if (nv>0) nVec = nv;
9368f7e8f9dSMark Adams   } else Nf = 1;
9378f7e8f9dSMark Adams   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
9388f7e8f9dSMark Adams   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
9398f7e8f9dSMark Adams   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
9408f7e8f9dSMark Adams   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
9418f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
9428f7e8f9dSMark Adams   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
9438f7e8f9dSMark Adams #endif
9448f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
9458f7e8f9dSMark Adams   {
9468f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
9478f7e8f9dSMark Adams     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
9488f7e8f9dSMark Adams     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
9498f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
9508f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
9518f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
9528f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
9538f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
9548f7e8f9dSMark Adams     size_t flops_h = 0.0;
9558f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
9568f7e8f9dSMark Adams     Kokkos::View<size_t> d_flops_k ("flops");
9578f7e8f9dSMark Adams     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
9588f7e8f9dSMark Adams     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
9598f7e8f9dSMark Adams     Kokkos::deep_copy (d_flops_k, h_flops_k);
9608f7e8f9dSMark Adams     Kokkos::deep_copy (d_r_k, h_r_k);
9618f7e8f9dSMark Adams     Kokkos::deep_copy (d_ic_k, h_ic_k);
9628f7e8f9dSMark Adams     // Fill A --> fact
9638f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
964*042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
9658f7e8f9dSMark Adams         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
9668f7e8f9dSMark Adams         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
9678f7e8f9dSMark Adams         // zero rows of B
9688f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
9698f7e8f9dSMark Adams             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
9708f7e8f9dSMark Adams             PetscScalar *baL = ba_d + bi_d[rowb];
9718f7e8f9dSMark Adams             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
9728f7e8f9dSMark Adams             /* zero (unfactored row) */
9738f7e8f9dSMark Adams             for (int j=0;j<nzbL;j++) baL[j] = 0;
9748f7e8f9dSMark Adams             for (int j=0;j<nzbU;j++) baU[j] = 0;
9758f7e8f9dSMark Adams           });
9768f7e8f9dSMark Adams         // copy A into B
9778f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
9788f7e8f9dSMark Adams             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
9798f7e8f9dSMark Adams             const PetscScalar *av    = aa_d + ai_d[rowa];
9808f7e8f9dSMark Adams             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
9818f7e8f9dSMark Adams             /* load in initial (unfactored row) */
9828f7e8f9dSMark Adams             for (int j=0;j<nza;j++) {
9838f7e8f9dSMark Adams               PetscInt    colb = ic[ajtmp[j]];
9848f7e8f9dSMark Adams               PetscScalar vala = av[j];
9858f7e8f9dSMark Adams               if (colb == rowb) {
9868f7e8f9dSMark Adams                 *(ba_d + bdiag_d[rowb]) = vala;
9878f7e8f9dSMark Adams               } else {
9888f7e8f9dSMark Adams                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
9898f7e8f9dSMark Adams                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
9908f7e8f9dSMark Adams                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
9918f7e8f9dSMark Adams                 for (int j=0; j<nz ; j++) {
9928f7e8f9dSMark Adams                   if (pbj[j] == colb) {
9938f7e8f9dSMark Adams                     pba[j] = vala;
9948f7e8f9dSMark Adams                     set++;
9958f7e8f9dSMark Adams                     break;
9968f7e8f9dSMark Adams                   }
9978f7e8f9dSMark Adams                 }
9988f7e8f9dSMark Adams                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
9998f7e8f9dSMark Adams               }
10008f7e8f9dSMark Adams             }
10018f7e8f9dSMark Adams           });
10028f7e8f9dSMark Adams       });
10038f7e8f9dSMark Adams     Kokkos::fence();
1004930e68a5SMark Adams 
10058f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
10068f7e8f9dSMark Adams         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
10078f7e8f9dSMark Adams         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
10088f7e8f9dSMark Adams         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1009*042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
10108f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc;
10118f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
10128f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
10138f7e8f9dSMark Adams         for (int ii=start; ii<end-1; ii++) {
10148f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
10158f7e8f9dSMark Adams           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
10168f7e8f9dSMark Adams           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
10178f7e8f9dSMark Adams           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
10188f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
10198f7e8f9dSMark Adams               PetscInt kIdx = j*Ni + field_block_idx;
10208f7e8f9dSMark Adams               if (kIdx >= nzUi) /* void */ ;
10218f7e8f9dSMark Adams               else {
10228f7e8f9dSMark Adams                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
10238f7e8f9dSMark Adams                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
10248f7e8f9dSMark Adams                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
10258f7e8f9dSMark Adams                 size_t         st_idx;
10268f7e8f9dSMark Adams                 // find and do L(k,i) = A(:k,i) / A(i,i)
10278f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
10288f7e8f9dSMark Adams                 // get column, there has got to be a better way
10298f7e8f9dSMark Adams                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
10308f7e8f9dSMark Adams                     //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii);
10318f7e8f9dSMark Adams                     if (pjL[j] == ii) {
10328f7e8f9dSMark Adams                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
10338f7e8f9dSMark Adams                       idx = j; // output
10348f7e8f9dSMark Adams                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
10358f7e8f9dSMark Adams                     }
10368f7e8f9dSMark Adams                   }, st_idx);
10378f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
10388f7e8f9dSMark Adams                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii);
10398f7e8f9dSMark Adams                 else { // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
10408f7e8f9dSMark Adams                   // U(i+1,:end)
10418f7e8f9dSMark Adams                   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
10428f7e8f9dSMark Adams                       PetscScalar Uij = baUi[uiIdx];
10438f7e8f9dSMark Adams                       PetscInt    col = bjUi[uiIdx];
10448f7e8f9dSMark Adams                       if (col==myk) {
10458f7e8f9dSMark Adams                         // A_kk = A_kk - L_ki * U_ij(k)
10468f7e8f9dSMark Adams                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
10478f7e8f9dSMark Adams                         *Akkv = *Akkv - L_ki() * Uij; // UiK
10488f7e8f9dSMark Adams                       } else {
10498f7e8f9dSMark Adams                         PetscScalar    *start, *end, *pAkjv=NULL;
10508f7e8f9dSMark Adams                         PetscInt       high, low;
10518f7e8f9dSMark Adams                         const PetscInt *startj;
10528f7e8f9dSMark Adams                         if (col<myk) { // L
10538f7e8f9dSMark Adams                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
10548f7e8f9dSMark Adams                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
10558f7e8f9dSMark Adams                           start = pLki+1; // start at pLki+1, A22(myk,1)
10568f7e8f9dSMark Adams                           startj= bj_d + bi_d[myk] + idx;
10578f7e8f9dSMark Adams                           end   = ba_d + bi_d[myk+1];
10588f7e8f9dSMark Adams                         } else {
10598f7e8f9dSMark Adams                           PetscInt idx = bdiag_d[myk+1]+1;
10608f7e8f9dSMark Adams                           start = ba_d + idx;
10618f7e8f9dSMark Adams                           startj= bj_d + idx;
10628f7e8f9dSMark Adams                           end   = ba_d + bdiag_d[myk];
10638f7e8f9dSMark Adams                         }
10648f7e8f9dSMark Adams                         // search for 'col', use bisection search - TODO
10658f7e8f9dSMark Adams                         low  = 0;
10668f7e8f9dSMark Adams                         high = (PetscInt)(end-start);
10678f7e8f9dSMark Adams                         while (high-low > 5) {
10688f7e8f9dSMark Adams                           int t = (low+high)/2;
10698f7e8f9dSMark Adams                           if (startj[t] > col) high = t;
10708f7e8f9dSMark Adams                           else                 low  = t;
10718f7e8f9dSMark Adams                         }
10728f7e8f9dSMark Adams                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
10738f7e8f9dSMark Adams                           if (startj[pAkjv-start] == col) break;
10748f7e8f9dSMark Adams                         }
10758f7e8f9dSMark Adams                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col);
10768f7e8f9dSMark Adams                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
10778f7e8f9dSMark Adams                       }
10788f7e8f9dSMark Adams                     });
10798f7e8f9dSMark Adams                 }
10808f7e8f9dSMark Adams               }
10818f7e8f9dSMark Adams             });
10828f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
10838f7e8f9dSMark Adams           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
10848f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
10858f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
10868f7e8f9dSMark Adams       });
10878f7e8f9dSMark Adams     Kokkos::fence();
10888f7e8f9dSMark Adams     Kokkos::deep_copy (h_flops_k, d_flops_k);
10898f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
10908f7e8f9dSMark Adams     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
10918f7e8f9dSMark Adams #elif  defined(PETSC_USE_LOG)
10928f7e8f9dSMark Adams     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
10938f7e8f9dSMark Adams #endif
10948f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
10958f7e8f9dSMark Adams         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
10968f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
10978f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
10988f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
10998f7e8f9dSMark Adams             int i = start + outer_index*Ni + lg_rank%Ni;
11008f7e8f9dSMark Adams             if (i < end) {
11018f7e8f9dSMark Adams               PetscScalar *pv = ba_d + bdiag_d[i];
11028f7e8f9dSMark Adams               *pv = 1.0/(*pv);
11038f7e8f9dSMark Adams             }
11048f7e8f9dSMark Adams           });
11058f7e8f9dSMark Adams       });
11068f7e8f9dSMark Adams   }
11078f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
11088f7e8f9dSMark Adams   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
11098f7e8f9dSMark Adams #endif
11108f7e8f9dSMark Adams   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
11118f7e8f9dSMark Adams   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
11128f7e8f9dSMark Adams 
11138f7e8f9dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
11148f7e8f9dSMark Adams   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
11158f7e8f9dSMark Adams   if (b->inode.size) {
11168f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
11178f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
11188f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
11198f7e8f9dSMark Adams   } else {
11208f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
11218f7e8f9dSMark Adams   }
11228f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
11238f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
11248f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
11258f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
11268f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
11278f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
11288f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
11298f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
11308f7e8f9dSMark Adams 
1131930e68a5SMark Adams   PetscFunctionReturn(0);
1132930e68a5SMark Adams }
1133930e68a5SMark Adams 
113486a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1135930e68a5SMark Adams {
1136930e68a5SMark Adams   PetscErrorCode   ierr;
1137930e68a5SMark Adams 
1138930e68a5SMark Adams   PetscFunctionBegin;
1139930e68a5SMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
114086a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
114186a27549SJunchao Zhang   PetscFunctionReturn(0);
114286a27549SJunchao Zhang }
114386a27549SJunchao Zhang 
114486a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
114586a27549SJunchao Zhang {
114686a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
114786a27549SJunchao Zhang 
114886a27549SJunchao Zhang   PetscFunctionBegin;
114986a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
115086a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
115186a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
115286a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
115386a27549SJunchao Zhang   }
115486a27549SJunchao Zhang   PetscFunctionReturn(0);
115586a27549SJunchao Zhang }
115686a27549SJunchao Zhang 
115786a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
115886a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
115986a27549SJunchao Zhang {
116086a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
116186a27549SJunchao Zhang   MatColumnIndexType             n = A->rmap->n;
116286a27549SJunchao Zhang 
116386a27549SJunchao Zhang   PetscFunctionBegin;
116486a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
116586a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
116686a27549SJunchao Zhang     factors->iLt_d = MatRowOffsetKokkosView("factors->iLt_d",n+1);
116786a27549SJunchao Zhang     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
116886a27549SJunchao Zhang     factors->jLt_d = MatColumnIndexKokkosView("factors->jLt_d",factors->jL_d.extent(0));
116986a27549SJunchao Zhang     factors->aLt_d = MatValueKokkosView("factors->aLt_d",factors->aL_d.extent(0));
117086a27549SJunchao Zhang 
117186a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
117286a27549SJunchao Zhang       ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView,
117386a27549SJunchao Zhang       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView,
117486a27549SJunchao Zhang       MatRowOffsetKokkosView,DefaultExecutionSpace>(
117586a27549SJunchao Zhang         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
117686a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
117786a27549SJunchao Zhang 
117886a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
117986a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
118086a27549SJunchao Zhang     */
118186a27549SJunchao Zhang     KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace,
118286a27549SJunchao Zhang       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>(
118386a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
118486a27549SJunchao Zhang 
118586a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
118686a27549SJunchao Zhang 
118786a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
118886a27549SJunchao Zhang     factors->iUt_d = MatRowOffsetKokkosView("factors->iUt_d",n+1);
118986a27549SJunchao Zhang     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
119086a27549SJunchao Zhang     factors->jUt_d = MatColumnIndexKokkosView("factors->jUt_d",factors->jU_d.extent(0));
119186a27549SJunchao Zhang     factors->aUt_d = MatValueKokkosView("factors->aUt_d",factors->aU_d.extent(0));
119286a27549SJunchao Zhang 
119386a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
119486a27549SJunchao Zhang       ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView,
119586a27549SJunchao Zhang       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView,
119686a27549SJunchao Zhang       MatRowOffsetKokkosView,DefaultExecutionSpace>(
119786a27549SJunchao Zhang         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
119886a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
119986a27549SJunchao Zhang 
120086a27549SJunchao Zhang     /* Sort indices. See comments above */
120186a27549SJunchao Zhang     KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace,
120286a27549SJunchao Zhang       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>(
120386a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
120486a27549SJunchao Zhang 
120586a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
120686a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
120786a27549SJunchao Zhang   }
120886a27549SJunchao Zhang   PetscFunctionReturn(0);
120986a27549SJunchao Zhang }
121086a27549SJunchao Zhang 
121186a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
121286a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
121386a27549SJunchao Zhang {
121486a27549SJunchao Zhang   PetscErrorCode                 ierr;
121586a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
121686a27549SJunchao Zhang   PetscScalarKokkosView          xv;
121786a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
121886a27549SJunchao Zhang 
121986a27549SJunchao Zhang   PetscFunctionBegin;
122086a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
122186a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
122286a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
122386a27549SJunchao Zhang   /* Solve L tmpv = b */
122486a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector);
122586a27549SJunchao Zhang   /* Solve Ux = tmpv */
122686a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv);
122786a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
122886a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
122986a27549SJunchao Zhang   PetscFunctionReturn(0);
123086a27549SJunchao Zhang }
123186a27549SJunchao Zhang 
123286a27549SJunchao Zhang /* Solve A^T x = b, with A^T = U^T L^T */
123386a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
123486a27549SJunchao Zhang {
123586a27549SJunchao Zhang   PetscErrorCode                 ierr;
123686a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
123786a27549SJunchao Zhang   PetscScalarKokkosView          xv;
123886a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
123986a27549SJunchao Zhang 
124086a27549SJunchao Zhang   PetscFunctionBegin;
124186a27549SJunchao Zhang   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
124286a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
124386a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
124486a27549SJunchao Zhang   /* Solve U^T tmpv = b */
124586a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
124686a27549SJunchao Zhang 
124786a27549SJunchao Zhang   /* Solve L^T x = tmpv */
124886a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
124986a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
125086a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
125186a27549SJunchao Zhang   PetscFunctionReturn(0);
125286a27549SJunchao Zhang }
125386a27549SJunchao Zhang 
125486a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
125586a27549SJunchao Zhang {
125686a27549SJunchao Zhang   PetscErrorCode                 ierr;
125786a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
125886a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
125986a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
126086a27549SJunchao Zhang 
126186a27549SJunchao Zhang   PetscFunctionBegin;
126286a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
126386a27549SJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,aijkok->a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
126486a27549SJunchao Zhang 
126586a27549SJunchao Zhang   B->assembled                       = PETSC_TRUE;
126686a27549SJunchao Zhang   B->preallocated                    = PETSC_TRUE;
126786a27549SJunchao Zhang   B->ops->solve                      = MatSolve_SeqAIJKokkos;
126886a27549SJunchao Zhang   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
126986a27549SJunchao Zhang   B->ops->matsolve                   = NULL;
127086a27549SJunchao Zhang   B->ops->matsolvetranspose          = NULL;
127186a27549SJunchao Zhang   B->offloadmask                     = PETSC_OFFLOAD_GPU;
127286a27549SJunchao Zhang 
127386a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
127486a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
127586a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
127686a27549SJunchao Zhang   PetscFunctionReturn(0);
127786a27549SJunchao Zhang }
127886a27549SJunchao Zhang 
127986a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
128086a27549SJunchao Zhang {
128186a27549SJunchao Zhang   PetscErrorCode                 ierr;
128286a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok;
128386a27549SJunchao Zhang   Mat_SeqAIJ                     *b;
128486a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
128586a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
128686a27549SJunchao Zhang   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
128786a27549SJunchao Zhang   PetscInt                       n = A->rmap->n;
128886a27549SJunchao Zhang 
128986a27549SJunchao Zhang   PetscFunctionBegin;
129086a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
129186a27549SJunchao Zhang   /* Rebuild factors */
129286a27549SJunchao Zhang   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
129386a27549SJunchao Zhang   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
129486a27549SJunchao Zhang 
129586a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
129686a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
129786a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
129886a27549SJunchao Zhang 
129986a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
130086a27549SJunchao Zhang 
130186a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
130286a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
130386a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d,n+1);
130486a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
130586a27549SJunchao Zhang 
130686a27549SJunchao Zhang   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
130786a27549SJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
130886a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
130986a27549SJunchao Zhang 
131086a27549SJunchao Zhang   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
131186a27549SJunchao Zhang   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
131286a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
131386a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
131486a27549SJunchao Zhang 
131586a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
131686a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
131786a27549SJunchao Zhang  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
131886a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
131986a27549SJunchao Zhang  #else
132086a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
132186a27549SJunchao Zhang  #endif
132286a27549SJunchao Zhang 
132386a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
132486a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
132586a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
132686a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
132786a27549SJunchao Zhang 
132886a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
132986a27549SJunchao Zhang   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
133086a27549SJunchao Zhang   b     = (Mat_SeqAIJ*)B->data;
133186a27549SJunchao Zhang   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
133286a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
133386a27549SJunchao Zhang   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
133486a27549SJunchao Zhang 
133586a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
133686a27549SJunchao Zhang   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1337930e68a5SMark Adams   PetscFunctionReturn(0);
1338930e68a5SMark Adams }
1339930e68a5SMark Adams 
13408f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
13418f7e8f9dSMark Adams {
13428f7e8f9dSMark Adams   PetscErrorCode   ierr;
13438f7e8f9dSMark Adams   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
13448f7e8f9dSMark Adams   const PetscInt   nrows   = A->rmap->n;
1345930e68a5SMark Adams 
13468f7e8f9dSMark Adams   PetscFunctionBegin;
13478f7e8f9dSMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
13488f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
13498f7e8f9dSMark Adams   // move B data into Kokkos
13508f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
13518f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
13528f7e8f9dSMark Adams   {
13538f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
13548f7e8f9dSMark Adams     if (!baijkok->diag_d) {
13558f7e8f9dSMark Adams       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1356152b3e56SJunchao Zhang       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
13578f7e8f9dSMark Adams       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
13588f7e8f9dSMark Adams     }
13598f7e8f9dSMark Adams   }
13608f7e8f9dSMark Adams   PetscFunctionReturn(0);
13618f7e8f9dSMark Adams }
13628f7e8f9dSMark Adams 
136386a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1364930e68a5SMark Adams {
1365930e68a5SMark Adams   PetscFunctionBegin;
1366930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
1367930e68a5SMark Adams   PetscFunctionReturn(0);
1368930e68a5SMark Adams }
1369930e68a5SMark Adams 
13708f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
13718f7e8f9dSMark Adams {
13728f7e8f9dSMark Adams   PetscFunctionBegin;
13738f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
13748f7e8f9dSMark Adams   PetscFunctionReturn(0);
13758f7e8f9dSMark Adams }
13768f7e8f9dSMark Adams 
1377930e68a5SMark Adams /*MC
137886a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
137986a27549SJunchao Zhang   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1380930e68a5SMark Adams 
1381930e68a5SMark Adams   Level: beginner
1382930e68a5SMark Adams 
138386a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1384930e68a5SMark Adams M*/
138586a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1386930e68a5SMark Adams {
1387930e68a5SMark Adams   PetscErrorCode ierr;
1388930e68a5SMark Adams   PetscInt       n = A->rmap->n;
1389930e68a5SMark Adams 
1390930e68a5SMark Adams   PetscFunctionBegin;
1391930e68a5SMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1392930e68a5SMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1393930e68a5SMark Adams   (*B)->factortype = ftype;
13944ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1395930e68a5SMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1396930e68a5SMark Adams 
13978f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
1398930e68a5SMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
139986a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_TRUE;
140086a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
140186a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
140286a27549SJunchao Zhang     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
140386a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
140486a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
140586a27549SJunchao Zhang   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1406930e68a5SMark Adams 
1407930e68a5SMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
140886a27549SJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1409930e68a5SMark Adams   PetscFunctionReturn(0);
1410930e68a5SMark Adams }
14118f7e8f9dSMark Adams 
14128f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
14138f7e8f9dSMark Adams {
14148f7e8f9dSMark Adams   PetscErrorCode ierr;
14158f7e8f9dSMark Adams   PetscInt       n = A->rmap->n;
14168f7e8f9dSMark Adams 
14178f7e8f9dSMark Adams   PetscFunctionBegin;
14188f7e8f9dSMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
14198f7e8f9dSMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
14208f7e8f9dSMark Adams   (*B)->factortype = ftype;
1421f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
14224ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
14238f7e8f9dSMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
14248f7e8f9dSMark Adams 
14258f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
14268f7e8f9dSMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
14278f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
14288f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
14298f7e8f9dSMark Adams 
14308f7e8f9dSMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
14318f7e8f9dSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
14328f7e8f9dSMark Adams   PetscFunctionReturn(0);
14338f7e8f9dSMark Adams }
143486a27549SJunchao Zhang 
143586a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
143686a27549SJunchao Zhang {
143786a27549SJunchao Zhang   PetscErrorCode ierr;
143886a27549SJunchao Zhang 
143986a27549SJunchao Zhang   PetscFunctionBegin;
144086a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
144186a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
144286a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
144386a27549SJunchao Zhang   PetscFunctionReturn(0);
144486a27549SJunchao Zhang }
144586a27549SJunchao Zhang 
1446