xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision cbc6b2250e380677eada4bbc58a36dc55ca92067)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2076ba34aSJunchao Zhang #include <petscpkg_version.h>
3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
442550becSJunchao Zhang #include <petsc/private/sfimpl.h>
58c3ff71bSJunchao Zhang #include <petscsystypes.h>
68c3ff71bSJunchao Zhang #include <petscerror.h>
78c3ff71bSJunchao Zhang 
88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
9f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
10076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp>
118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
128c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
15076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
16076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
1786a27549SJunchao Zhang 
1842550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
198c3ff71bSJunchao Zhang 
208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
218c3ff71bSJunchao Zhang 
22076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
25076ba34aSJunchao Zhang  */
268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
278c3ff71bSJunchao Zhang {
288c3ff71bSJunchao Zhang   PetscErrorCode    ierr;
29076ba34aSJunchao Zhang   Mat_SeqAIJ        *aijseq;
30076ba34aSJunchao Zhang   Mat_SeqAIJKokkos  *aijkok;
318c3ff71bSJunchao Zhang 
328c3ff71bSJunchao Zhang   PetscFunctionBegin;
33076ba34aSJunchao Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
348c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
35076ba34aSJunchao Zhang 
36076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38076ba34aSJunchao Zhang 
39076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
40076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
41076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
42076ba34aSJunchao Zhang   */
43076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
44076ba34aSJunchao Zhang     delete aijkok;
45076ba34aSJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
46076ba34aSJunchao Zhang     A->spptr = aijkok;
47076ba34aSJunchao Zhang   }
48076ba34aSJunchao Zhang 
49a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
50a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
51a587d139SMark   }
528c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
538c3ff71bSJunchao Zhang }
548c3ff71bSJunchao Zhang 
5586a27549SJunchao Zhang /* Sync CSR data to device if not yet */
56076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
578c3ff71bSJunchao Zhang {
588c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
598c3ff71bSJunchao Zhang 
608c3ff71bSJunchao Zhang   PetscFunctionBegin;
612c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
622c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
63076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
64076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
65580c7c76SPierre Jolivet     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
6686a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
678c3ff71bSJunchao Zhang   }
688c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
698c3ff71bSJunchao Zhang }
708c3ff71bSJunchao Zhang 
71076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
72fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
7386a27549SJunchao Zhang {
7486a27549SJunchao Zhang   PetscErrorCode   ierr;
7586a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
7686a27549SJunchao Zhang 
7786a27549SJunchao Zhang   PetscFunctionBegin;
782c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
7986a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
8086a27549SJunchao Zhang   aijkok->a_dual.modify_device();
8186a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
8286a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
832328674fSJunchao Zhang   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
8486a27549SJunchao Zhang   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
8586a27549SJunchao Zhang   PetscFunctionReturn(0);
8686a27549SJunchao Zhang }
8786a27549SJunchao Zhang 
88f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
89f0cf5187SStefano Zampini {
90f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
91f0cf5187SStefano Zampini 
92f0cf5187SStefano Zampini   PetscFunctionBegin;
93f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9486a27549SJunchao Zhang   /* We do not expect one needs factors on host  */
952c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
962c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
97076ba34aSJunchao Zhang   aijkok->a_dual.sync_host();
98f0cf5187SStefano Zampini   PetscFunctionReturn(0);
99f0cf5187SStefano Zampini }
100f0cf5187SStefano Zampini 
101f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
102f0cf5187SStefano Zampini {
103076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
104f0cf5187SStefano Zampini 
105f0cf5187SStefano Zampini   PetscFunctionBegin;
106076ba34aSJunchao Zhang   if (aijkok) {
107076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
108076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
109076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
110076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
111076ba34aSJunchao Zhang   }
112076ba34aSJunchao Zhang   PetscFunctionReturn(0);
113076ba34aSJunchao Zhang }
114076ba34aSJunchao Zhang 
115076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
116076ba34aSJunchao Zhang {
117076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
118076ba34aSJunchao Zhang 
119076ba34aSJunchao Zhang   PetscFunctionBegin;
120076ba34aSJunchao Zhang   if (aijkok) aijkok->a_dual.modify_host();
121076ba34aSJunchao Zhang   PetscFunctionReturn(0);
122076ba34aSJunchao Zhang }
123076ba34aSJunchao Zhang 
124076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
125076ba34aSJunchao Zhang {
126076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
127076ba34aSJunchao Zhang 
128076ba34aSJunchao Zhang   PetscFunctionBegin;
1292328674fSJunchao Zhang   if (aijkok) {
130076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
131076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1322328674fSJunchao Zhang   } else {
1332328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1342328674fSJunchao Zhang   }
135076ba34aSJunchao Zhang   PetscFunctionReturn(0);
136076ba34aSJunchao Zhang }
137076ba34aSJunchao Zhang 
138076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
139076ba34aSJunchao Zhang {
140076ba34aSJunchao Zhang   PetscFunctionBegin;
141076ba34aSJunchao Zhang   *array = NULL;
142076ba34aSJunchao Zhang   PetscFunctionReturn(0);
143076ba34aSJunchao Zhang }
144076ba34aSJunchao Zhang 
145076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
146076ba34aSJunchao Zhang {
147076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
148076ba34aSJunchao Zhang 
149076ba34aSJunchao Zhang   PetscFunctionBegin;
1502328674fSJunchao Zhang   if (aijkok) {
151076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1522328674fSJunchao Zhang   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
1532328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1542328674fSJunchao Zhang   }
155076ba34aSJunchao Zhang   PetscFunctionReturn(0);
156076ba34aSJunchao Zhang }
157076ba34aSJunchao Zhang 
158076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
159076ba34aSJunchao Zhang {
160076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
161076ba34aSJunchao Zhang 
162076ba34aSJunchao Zhang   PetscFunctionBegin;
1632328674fSJunchao Zhang   if (aijkok) {
164076ba34aSJunchao Zhang     aijkok->a_dual.clear_sync_state();
165076ba34aSJunchao Zhang     aijkok->a_dual.modify_host();
1662328674fSJunchao Zhang   }
167f0cf5187SStefano Zampini   PetscFunctionReturn(0);
168f0cf5187SStefano Zampini }
169f0cf5187SStefano Zampini 
170a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
171042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
172a587d139SMark {
173042217e8SBarry Smith   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
174042217e8SBarry Smith   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
175a587d139SMark 
176a587d139SMark   PetscFunctionBegin;
1772c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
178152b3e56SJunchao Zhang   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
179a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
180a587d139SMark   PetscFunctionReturn(0);
181a587d139SMark }
182a587d139SMark 
183a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
184042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
185a587d139SMark {
186042217e8SBarry Smith   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
187a587d139SMark 
188a587d139SMark   PetscFunctionBegin;
189a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
190a587d139SMark     *d_mat = aijkok->device_mat_d.data();
191a587d139SMark   } else {
192a587d139SMark     PetscErrorCode   ierr;
193a587d139SMark     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
194a587d139SMark     *d_mat  = NULL;
195a587d139SMark   }
196a587d139SMark   PetscFunctionReturn(0);
197a587d139SMark }
198076ba34aSJunchao Zhang 
199076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */
200076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
201152b3e56SJunchao Zhang {
202152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
203152b3e56SJunchao Zhang 
204152b3e56SJunchao Zhang   PetscFunctionBegin;
2052c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
206076ba34aSJunchao Zhang   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
207076ba34aSJunchao Zhang     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
208076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
209076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
210076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
21186a27549SJunchao Zhang     aijkok->transpose_updated = PETSC_TRUE;
212076ba34aSJunchao Zhang   }
213076ba34aSJunchao Zhang   *csrmatT = &aijkok->csrmatT;
214152b3e56SJunchao Zhang   PetscFunctionReturn(0);
215152b3e56SJunchao Zhang }
216152b3e56SJunchao Zhang 
217076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */
218076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
219152b3e56SJunchao Zhang {
220eeadb341SJunchao Zhang   PetscErrorCode                   ierr;
221152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
222152b3e56SJunchao Zhang 
223152b3e56SJunchao Zhang   PetscFunctionBegin;
224eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2252c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
226076ba34aSJunchao Zhang   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
227076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
228076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
229076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
230076ba34aSJunchao Zhang    #if defined(PETSC_USE_COMPLEX)
231076ba34aSJunchao Zhang     const auto& a = aijkok->csrmatH.values;
232076ba34aSJunchao Zhang     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
233076ba34aSJunchao Zhang    #endif
23486a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_TRUE;
235076ba34aSJunchao Zhang   }
236076ba34aSJunchao Zhang   *csrmatH = &aijkok->csrmatH;
237eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
238152b3e56SJunchao Zhang   PetscFunctionReturn(0);
239152b3e56SJunchao Zhang }
240a587d139SMark 
2418c3ff71bSJunchao Zhang /* y = A x */
2428c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2438c3ff71bSJunchao Zhang {
2448c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2458c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
246152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
247152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2488c3ff71bSJunchao Zhang 
2498c3ff71bSJunchao Zhang   PetscFunctionBegin;
2506af1d01cSJed Brown   ierr   = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2518c3ff71bSJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
252152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
253ad7ac7b2SJunchao Zhang   ierr   = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
2548c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
255152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
256152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
257ad7ac7b2SJunchao Zhang   ierr   = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
258076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
259152b3e56SJunchao Zhang   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2606af1d01cSJed Brown   ierr   = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2618c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2628c3ff71bSJunchao Zhang }
2638c3ff71bSJunchao Zhang 
2648c3ff71bSJunchao Zhang /* y = A^T x */
2658c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2668c3ff71bSJunchao Zhang {
2678c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2688c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
269152b3e56SJunchao Zhang   const char                       *mode;
270152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
271152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
272076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
2738c3ff71bSJunchao Zhang 
2748c3ff71bSJunchao Zhang   PetscFunctionBegin;
275eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2768c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
277152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
278ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
279152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
280076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
281152b3e56SJunchao Zhang     mode = "N";
282152b3e56SJunchao Zhang   } else {
283076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
284076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
285152b3e56SJunchao Zhang     mode = "T";
286152b3e56SJunchao Zhang   }
287076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
288152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
289ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
290076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
291eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2928c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2938c3ff71bSJunchao Zhang }
2948c3ff71bSJunchao Zhang 
2958c3ff71bSJunchao Zhang /* y = A^H x */
2968c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2978c3ff71bSJunchao Zhang {
2988c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2998c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
300152b3e56SJunchao Zhang   const char                       *mode;
301152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
302152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
303076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3048c3ff71bSJunchao Zhang 
3058c3ff71bSJunchao Zhang   PetscFunctionBegin;
306eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3078c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
308152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
309ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
310152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
311076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
312152b3e56SJunchao Zhang     mode = "N";
313152b3e56SJunchao Zhang   } else {
314076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
315076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
316152b3e56SJunchao Zhang     mode = "C";
317152b3e56SJunchao Zhang   }
318076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
319152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
320ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
321076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
322eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3238c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3248c3ff71bSJunchao Zhang }
3258c3ff71bSJunchao Zhang 
3268c3ff71bSJunchao Zhang /* z = A x + y */
3278c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
3288c3ff71bSJunchao Zhang {
3298c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3308c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
331152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
332152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
3338c3ff71bSJunchao Zhang 
3348c3ff71bSJunchao Zhang   PetscFunctionBegin;
335eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3368c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
337152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
338152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
339ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
3408c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
3418c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
342152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
343152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
344152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
345ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
346152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
347eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3488c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3498c3ff71bSJunchao Zhang }
3508c3ff71bSJunchao Zhang 
3518c3ff71bSJunchao Zhang /* z = A^T x + y */
3528c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3538c3ff71bSJunchao Zhang {
3548c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3558c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
356152b3e56SJunchao Zhang   const char                       *mode;
357152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
358152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
359076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3608c3ff71bSJunchao Zhang 
3618c3ff71bSJunchao Zhang   PetscFunctionBegin;
362eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3638c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
364152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
365152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
366ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
3678c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
368152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
369076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
370152b3e56SJunchao Zhang     mode = "N";
371152b3e56SJunchao Zhang   } else {
372076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
373076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
374152b3e56SJunchao Zhang     mode = "T";
375152b3e56SJunchao Zhang   }
376076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
377152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
378152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
379ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
380076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
381eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3828c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3838c3ff71bSJunchao Zhang }
3848c3ff71bSJunchao Zhang 
3858c3ff71bSJunchao Zhang /* z = A^H x + y */
3868c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3878c3ff71bSJunchao Zhang {
3888c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3898c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
390152b3e56SJunchao Zhang   const char                       *mode;
391152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
392152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
393076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3948c3ff71bSJunchao Zhang 
3958c3ff71bSJunchao Zhang   PetscFunctionBegin;
396eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3978c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
398152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
399152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
400ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
4018c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
402152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
403076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
404152b3e56SJunchao Zhang     mode = "N";
405152b3e56SJunchao Zhang   } else {
406076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
407076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
408152b3e56SJunchao Zhang     mode = "C";
409152b3e56SJunchao Zhang   }
410076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
411152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
412152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
413ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
414076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
415eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
416152b3e56SJunchao Zhang   PetscFunctionReturn(0);
417152b3e56SJunchao Zhang }
418152b3e56SJunchao Zhang 
419152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
420152b3e56SJunchao Zhang {
421152b3e56SJunchao Zhang   PetscErrorCode            ierr;
422152b3e56SJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
423152b3e56SJunchao Zhang 
424152b3e56SJunchao Zhang   PetscFunctionBegin;
425152b3e56SJunchao Zhang   switch (op) {
426152b3e56SJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
427152b3e56SJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
428152b3e56SJunchao Zhang       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
429152b3e56SJunchao Zhang       A->form_explicit_transpose = flg;
430152b3e56SJunchao Zhang       break;
431152b3e56SJunchao Zhang     default:
432152b3e56SJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
433152b3e56SJunchao Zhang       break;
434152b3e56SJunchao Zhang   }
4358c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4368c3ff71bSJunchao Zhang }
4378c3ff71bSJunchao Zhang 
438076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
4393d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
4408c3ff71bSJunchao Zhang {
4418c3ff71bSJunchao Zhang   PetscErrorCode   ierr;
442076ba34aSJunchao Zhang   Mat_SeqAIJ       *aseq;
4438c3ff71bSJunchao Zhang 
4448c3ff71bSJunchao Zhang   PetscFunctionBegin;
445a3f881fbSStefano Zampini   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
446076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
447076ba34aSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
4488c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
449076ba34aSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
450076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
4512c71b3e2SJacob Faibussowitsch     PetscCheckFalse(A != *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
452076ba34aSJunchao Zhang     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
453076ba34aSJunchao Zhang     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
454076ba34aSJunchao Zhang     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
455076ba34aSJunchao Zhang     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
456076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ*>(A->data);
457394ed5ebSJunchao Zhang     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
4582c71b3e2SJacob Faibussowitsch       PetscCheckFalse(A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
459076ba34aSJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
4608c3ff71bSJunchao Zhang     }
461076ba34aSJunchao Zhang   }
4628c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4638c3ff71bSJunchao Zhang }
4648c3ff71bSJunchao Zhang 
465076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
466076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
467076ba34aSJunchao Zhang  */
468076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
4698c3ff71bSJunchao Zhang {
4708c3ff71bSJunchao Zhang   PetscErrorCode        ierr;
471076ba34aSJunchao Zhang   Mat_SeqAIJ            *bseq;
472076ba34aSJunchao Zhang   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
473076ba34aSJunchao Zhang   Mat                   mat;
4748c3ff71bSJunchao Zhang 
4758c3ff71bSJunchao Zhang   PetscFunctionBegin;
476076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
477076ba34aSJunchao Zhang   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
478076ba34aSJunchao Zhang   mat  = *B;
479076ba34aSJunchao Zhang   if (A->assembled) {
480076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
481076ba34aSJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
482076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
483076ba34aSJunchao Zhang     /* Now copy values to B if needed */
484076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
485076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
486076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
487076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
488076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
489076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
490076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
491076ba34aSJunchao Zhang       }
492076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
493076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
494076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
495076ba34aSJunchao Zhang     }
496076ba34aSJunchao Zhang     mat->spptr = bkok;
497076ba34aSJunchao Zhang   }
498076ba34aSJunchao Zhang 
499076ba34aSJunchao Zhang   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
500076ba34aSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
501076ba34aSJunchao Zhang   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
502076ba34aSJunchao Zhang   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
5038c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5048c3ff71bSJunchao Zhang }
5058c3ff71bSJunchao Zhang 
5060ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
5070ecb592aSJunchao Zhang {
5080ecb592aSJunchao Zhang   PetscErrorCode    ierr;
5090ecb592aSJunchao Zhang   Mat               At;
510ff751488SJunchao Zhang   KokkosCsrMatrix   *internT;
5110ecb592aSJunchao Zhang   Mat_SeqAIJKokkos  *atkok,*bkok;
5120ecb592aSJunchao Zhang 
5130ecb592aSJunchao Zhang   PetscFunctionBegin;
5140ecb592aSJunchao Zhang   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
5150ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
516ff751488SJunchao Zhang     /* Deep copy internT, as we want to isolate the internal transpose */
517ff751488SJunchao Zhang     CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
5180ecb592aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
5190ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
5207a3b1c56SStefano Zampini     else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
5210ecb592aSJunchao Zhang   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
5220ecb592aSJunchao Zhang     if ((*B)->assembled) {
5230ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
5240ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
5250ecb592aSJunchao Zhang       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
5260ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
5270ecb592aSJunchao Zhang       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
5280ecb592aSJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
5290ecb592aSJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
5300ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
5310ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
5320ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
5330ecb592aSJunchao Zhang   }
5340ecb592aSJunchao Zhang   PetscFunctionReturn(0);
5350ecb592aSJunchao Zhang }
5360ecb592aSJunchao Zhang 
5378c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
5388c3ff71bSJunchao Zhang {
5398c3ff71bSJunchao Zhang   PetscErrorCode             ierr;
54086a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
5418c3ff71bSJunchao Zhang 
5428c3ff71bSJunchao Zhang   PetscFunctionBegin;
54386a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
54486a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
5458c3ff71bSJunchao Zhang     delete aijkok;
54686a27549SJunchao Zhang   } else {
54786a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
54886a27549SJunchao Zhang   }
549*cbc6b225SStefano Zampini   A->spptr = NULL;
550152b3e56SJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
55142550becSJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
55242550becSJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
5538c3ff71bSJunchao Zhang   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
5548c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5558c3ff71bSJunchao Zhang }
5568c3ff71bSJunchao Zhang 
5573f3ba80aSJunchao Zhang /*MC
5583f3ba80aSJunchao Zhang    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
5593f3ba80aSJunchao Zhang 
5603f3ba80aSJunchao Zhang    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
5613f3ba80aSJunchao Zhang 
5623f3ba80aSJunchao Zhang    Options Database Keys:
5633f3ba80aSJunchao Zhang .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
5643f3ba80aSJunchao Zhang 
5653f3ba80aSJunchao Zhang   Level: beginner
5663f3ba80aSJunchao Zhang 
5673f3ba80aSJunchao Zhang .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS
5683f3ba80aSJunchao Zhang M*/
56986a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
57086a27549SJunchao Zhang {
57186a27549SJunchao Zhang   PetscErrorCode ierr;
57286a27549SJunchao Zhang 
57386a27549SJunchao Zhang   PetscFunctionBegin;
57486a27549SJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
57586a27549SJunchao Zhang   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
57686a27549SJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
57786a27549SJunchao Zhang   PetscFunctionReturn(0);
57886a27549SJunchao Zhang }
57986a27549SJunchao Zhang 
580076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
581076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
582a3f881fbSStefano Zampini {
583076ba34aSJunchao Zhang   PetscErrorCode               ierr;
584076ba34aSJunchao Zhang   Mat_SeqAIJ                   *a,*b;
585076ba34aSJunchao Zhang   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
586076ba34aSJunchao Zhang   MatScalarKokkosView          aa,ba,ca;
587076ba34aSJunchao Zhang   MatRowMapKokkosView          ai,bi,ci;
588076ba34aSJunchao Zhang   MatColIdxKokkosView          aj,bj,cj;
589076ba34aSJunchao Zhang   PetscInt                     m,n,nnz,aN;
590a3f881fbSStefano Zampini 
591a3f881fbSStefano Zampini   PetscFunctionBegin;
592076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
593076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
594076ba34aSJunchao Zhang   PetscValidPointer(C,4);
595076ba34aSJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
596076ba34aSJunchao Zhang   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
5972c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
5982c71b3e2SJacob Faibussowitsch   PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
599076ba34aSJunchao Zhang 
600076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
601076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
602076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ*>(A->data);
603076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ*>(B->data);
604076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
605076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
606076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
607076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
608076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
609076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
610076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
611076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
612076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
613076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
614076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
615076ba34aSJunchao Zhang     aj = akok->j_dual.view_device();
616076ba34aSJunchao Zhang     bj = bkok->j_dual.view_device();
617076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
618076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
619076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
620076ba34aSJunchao Zhang     ca = ca_dual.view_device();
621076ba34aSJunchao Zhang     ci = ci_dual.view_device();
622076ba34aSJunchao Zhang     cj = cj_dual.view_device();
623076ba34aSJunchao Zhang 
624076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
625076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
626076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
627076ba34aSJunchao Zhang       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
628076ba34aSJunchao Zhang 
629076ba34aSJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
630076ba34aSJunchao Zhang         ci(i) = coffset;
631076ba34aSJunchao Zhang         if (i == m-1) ci(m) = ai(m) + bi(m);
632076ba34aSJunchao Zhang       });
633076ba34aSJunchao Zhang 
634076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
635076ba34aSJunchao Zhang         if (k < alen) {
636076ba34aSJunchao Zhang           ca(coffset+k) = aa(ai(i)+k);
637076ba34aSJunchao Zhang           cj(coffset+k) = aj(ai(i)+k);
638076ba34aSJunchao Zhang         } else {
639076ba34aSJunchao Zhang           ca(coffset+k) = ba(bi(i)+k-alen);
640076ba34aSJunchao Zhang           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
641076ba34aSJunchao Zhang         }
642076ba34aSJunchao Zhang       });
643076ba34aSJunchao Zhang     });
644076ba34aSJunchao Zhang     ca_dual.modify_device();
645076ba34aSJunchao Zhang     ci_dual.modify_device();
646076ba34aSJunchao Zhang     cj_dual.modify_device();
647076ba34aSJunchao Zhang     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
648076ba34aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
649076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
650076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
651076ba34aSJunchao Zhang     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
652076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
653076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
654076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
655076ba34aSJunchao Zhang 
656076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
657076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
658076ba34aSJunchao Zhang       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
659076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
660076ba34aSJunchao Zhang         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
661076ba34aSJunchao Zhang         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
662076ba34aSJunchao Zhang       });
663076ba34aSJunchao Zhang     });
664076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
665076ba34aSJunchao Zhang   }
666076ba34aSJunchao Zhang   PetscFunctionReturn(0);
667076ba34aSJunchao Zhang }
668076ba34aSJunchao Zhang 
669076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
670076ba34aSJunchao Zhang {
671076ba34aSJunchao Zhang   PetscFunctionBegin;
672076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
673a3f881fbSStefano Zampini   PetscFunctionReturn(0);
674a3f881fbSStefano Zampini }
675a3f881fbSStefano Zampini 
676a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
677a3f881fbSStefano Zampini {
678076ba34aSJunchao Zhang   PetscErrorCode                 ierr;
679a3f881fbSStefano Zampini   Mat_Product                    *product = C->product;
680a3f881fbSStefano Zampini   Mat                            A,B;
681076ba34aSJunchao Zhang   bool                           transA,transB; /* use bool, since KK needs this type */
682a3f881fbSStefano Zampini   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
683a3f881fbSStefano Zampini   Mat_SeqAIJ                     *c;
684076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
685076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB;
686a3f881fbSStefano Zampini 
687a3f881fbSStefano Zampini   PetscFunctionBegin;
688a3f881fbSStefano Zampini   MatCheckProduct(C,1);
6892c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
690076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
691076ba34aSJunchao Zhang 
692076ba34aSJunchao Zhang   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
693076ba34aSJunchao Zhang     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
694076ba34aSJunchao Zhang     PetscFunctionReturn(0);
695076ba34aSJunchao Zhang   }
696076ba34aSJunchao Zhang 
697076ba34aSJunchao Zhang   switch (product->type) {
698076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
699076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
700076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
701076ba34aSJunchao Zhang     default:
70298921bdaSJacob Faibussowitsch       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
703076ba34aSJunchao Zhang   }
704076ba34aSJunchao Zhang 
705a3f881fbSStefano Zampini   A     = product->A;
706a3f881fbSStefano Zampini   B     = product->B;
707a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
708a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
709a3f881fbSStefano Zampini   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
710a3f881fbSStefano Zampini   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
711a3f881fbSStefano Zampini   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
712076ba34aSJunchao Zhang 
7132c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
714076ba34aSJunchao Zhang 
715076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
716076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
717076ba34aSJunchao Zhang 
718076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
719076ba34aSJunchao Zhang   if (transA) {
720076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
721076ba34aSJunchao Zhang     transA = false;
722a3f881fbSStefano Zampini   }
723a3f881fbSStefano Zampini 
724076ba34aSJunchao Zhang   if (transB) {
725076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
726076ba34aSJunchao Zhang     transB = false;
727076ba34aSJunchao Zhang   }
728eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
729076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
730076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
731eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
732076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
733a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
734a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
7357d3de750SJacob Faibussowitsch   ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
736a3f881fbSStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
7377d3de750SJacob Faibussowitsch   ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
738a3f881fbSStefano Zampini   c->reallocs         = 0;
739076ba34aSJunchao Zhang   C->info.mallocs     = 0;
740a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
741a3f881fbSStefano Zampini   C->assembled        = C->was_assembled = PETSC_TRUE;
742a3f881fbSStefano Zampini   C->num_ass++;
743a3f881fbSStefano Zampini   PetscFunctionReturn(0);
744a3f881fbSStefano Zampini }
745a3f881fbSStefano Zampini 
746a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
747a3f881fbSStefano Zampini {
748a3f881fbSStefano Zampini   PetscErrorCode                 ierr;
749076ba34aSJunchao Zhang   Mat_Product                    *product = C->product;
750076ba34aSJunchao Zhang   MatProductType                 ptype;
751076ba34aSJunchao Zhang   Mat                            A,B;
752076ba34aSJunchao Zhang   bool                           transA,transB;
753076ba34aSJunchao Zhang   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
754076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
755076ba34aSJunchao Zhang   MPI_Comm                       comm;
756076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
757a3f881fbSStefano Zampini 
758a3f881fbSStefano Zampini   PetscFunctionBegin;
759a3f881fbSStefano Zampini   MatCheckProduct(C,1);
760076ba34aSJunchao Zhang   ierr = PetscObjectGetComm((PetscObject)C,&comm);
7612c71b3e2SJacob Faibussowitsch   PetscCheckFalse(product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
762a3f881fbSStefano Zampini   A       = product->A;
763a3f881fbSStefano Zampini   B       = product->B;
764a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
765a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
766a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
767a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
768076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
769076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
770076ba34aSJunchao Zhang 
771a3f881fbSStefano Zampini   ptype   = product->type;
772a3f881fbSStefano Zampini   switch (ptype) {
773076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
774076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
775076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
776a3f881fbSStefano Zampini     default:
77798921bdaSJacob Faibussowitsch       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
778a3f881fbSStefano Zampini   }
779a3f881fbSStefano Zampini 
780076ba34aSJunchao Zhang   product->data = pdata = new MatProductData_SeqAIJKokkos();
781076ba34aSJunchao Zhang   pdata->kh.set_team_work_size(16);
782076ba34aSJunchao Zhang   pdata->kh.set_dynamic_scheduling(true);
783076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
784a3f881fbSStefano Zampini 
785076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
786076ba34aSJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
7876ffb9508SJunchao Zhang   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
7886ffb9508SJunchao Zhang      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
7896ffb9508SJunchao Zhang    */
790076ba34aSJunchao Zhang   pdata->kh.create_spgemm_handle(spgemm_alg);
791076ba34aSJunchao Zhang 
792eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
793076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
794076ba34aSJunchao Zhang   if (transA) {
795076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
796076ba34aSJunchao Zhang     transA = false;
797076ba34aSJunchao Zhang   }
798076ba34aSJunchao Zhang 
799076ba34aSJunchao Zhang   if (transB) {
800076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
801076ba34aSJunchao Zhang     transB = false;
802076ba34aSJunchao Zhang   }
803076ba34aSJunchao Zhang 
804076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
805076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
806076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
807076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
808076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
809076ba34aSJunchao Zhang   */
810076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
811076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
812eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
813076ba34aSJunchao Zhang 
814076ba34aSJunchao Zhang   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
815076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
816076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
817a3f881fbSStefano Zampini   PetscFunctionReturn(0);
818a3f881fbSStefano Zampini }
819a3f881fbSStefano Zampini 
820a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
821076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
822a3f881fbSStefano Zampini {
823a3f881fbSStefano Zampini   PetscErrorCode ierr;
824076ba34aSJunchao Zhang   Mat_Product    *product = mat->product;
825a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
826a3f881fbSStefano Zampini 
827a3f881fbSStefano Zampini   PetscFunctionBegin;
828a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
829a3f881fbSStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
830a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
831a3f881fbSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
832a3f881fbSStefano Zampini   }
833a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
834a3f881fbSStefano Zampini     switch (product->type) {
835a3f881fbSStefano Zampini     case MATPRODUCT_AB:
836a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
837a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
838a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
839a3f881fbSStefano Zampini       break;
840a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
841a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
842a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
843a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
844a3f881fbSStefano Zampini       break;
845a3f881fbSStefano Zampini     default:
846a3f881fbSStefano Zampini       break;
847a3f881fbSStefano Zampini     }
848a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
849a3f881fbSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
850a3f881fbSStefano Zampini   }
851a3f881fbSStefano Zampini   PetscFunctionReturn(0);
852a3f881fbSStefano Zampini }
853a587d139SMark 
854f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
855f0cf5187SStefano Zampini {
856f0cf5187SStefano Zampini   PetscErrorCode   ierr;
857f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
858f0cf5187SStefano Zampini 
859f0cf5187SStefano Zampini   PetscFunctionBegin;
860eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
861f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
862f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
863076ba34aSJunchao Zhang   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
864076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
865076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
8666af1d01cSJed Brown   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
867f0cf5187SStefano Zampini   PetscFunctionReturn(0);
868f0cf5187SStefano Zampini }
869f0cf5187SStefano Zampini 
870a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
871a587d139SMark {
872a587d139SMark   PetscErrorCode   ierr;
873076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
874a587d139SMark 
875a587d139SMark   PetscFunctionBegin;
876076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
8772328674fSJunchao Zhang   if (aijkok) { /* Only zero the device if data is already there */
878076ba34aSJunchao Zhang     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
879076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
8802328674fSJunchao Zhang   } else { /* Might be preallocated but not assembled */
8812328674fSJunchao Zhang     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
8822328674fSJunchao Zhang   }
883a587d139SMark   PetscFunctionReturn(0);
884a587d139SMark }
885a587d139SMark 
886db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
88742550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
888db78de30SJunchao Zhang {
889db78de30SJunchao Zhang   PetscErrorCode     ierr;
890db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
891db78de30SJunchao Zhang 
892db78de30SJunchao Zhang   PetscFunctionBegin;
893db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
894db78de30SJunchao Zhang   PetscValidPointer(kv,2);
895db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
896db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
897db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
898076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
899db78de30SJunchao Zhang   PetscFunctionReturn(0);
900db78de30SJunchao Zhang }
901db78de30SJunchao Zhang 
90242550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
903db78de30SJunchao Zhang {
904db78de30SJunchao Zhang   PetscFunctionBegin;
905db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
906db78de30SJunchao Zhang   PetscValidPointer(kv,2);
907db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
908db78de30SJunchao Zhang   PetscFunctionReturn(0);
909db78de30SJunchao Zhang }
910db78de30SJunchao Zhang 
91142550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
912db78de30SJunchao Zhang {
913db78de30SJunchao Zhang   PetscErrorCode     ierr;
914db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
915db78de30SJunchao Zhang 
916db78de30SJunchao Zhang   PetscFunctionBegin;
917db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
918db78de30SJunchao Zhang   PetscValidPointer(kv,2);
919db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
920db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
921db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
922076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
923db78de30SJunchao Zhang   PetscFunctionReturn(0);
924db78de30SJunchao Zhang }
925db78de30SJunchao Zhang 
92642550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
927db78de30SJunchao Zhang {
928db78de30SJunchao Zhang   PetscErrorCode     ierr;
929db78de30SJunchao Zhang 
930db78de30SJunchao Zhang   PetscFunctionBegin;
931db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
932db78de30SJunchao Zhang   PetscValidPointer(kv,2);
933db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
934076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
935db78de30SJunchao Zhang   PetscFunctionReturn(0);
936db78de30SJunchao Zhang }
937db78de30SJunchao Zhang 
93842550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
939db78de30SJunchao Zhang {
940db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
941db78de30SJunchao Zhang 
942db78de30SJunchao Zhang   PetscFunctionBegin;
943db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
944db78de30SJunchao Zhang   PetscValidPointer(kv,2);
945db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
946db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
947076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
948db78de30SJunchao Zhang   PetscFunctionReturn(0);
949db78de30SJunchao Zhang }
950db78de30SJunchao Zhang 
95142550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
952db78de30SJunchao Zhang {
953db78de30SJunchao Zhang   PetscErrorCode     ierr;
954db78de30SJunchao Zhang 
955db78de30SJunchao Zhang   PetscFunctionBegin;
956db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
957db78de30SJunchao Zhang   PetscValidPointer(kv,2);
958db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
959076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
960db78de30SJunchao Zhang   PetscFunctionReturn(0);
961db78de30SJunchao Zhang }
962db78de30SJunchao Zhang 
963c17cf699SJunchao Zhang /* Computes Y += alpha X */
964c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
965a587d139SMark {
966a587d139SMark   PetscErrorCode             ierr;
967a587d139SMark   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
968c17cf699SJunchao Zhang   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
969c17cf699SJunchao Zhang   ConstMatScalarKokkosView   Xa;
970c17cf699SJunchao Zhang   MatScalarKokkosView        Ya;
971a587d139SMark 
972a587d139SMark   PetscFunctionBegin;
973c17cf699SJunchao Zhang   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
974c17cf699SJunchao Zhang   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
975c17cf699SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
976c17cf699SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
977eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
978db78de30SJunchao Zhang 
979c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
980c17cf699SJunchao Zhang     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
981a587d139SMark     PetscBool e;
982a587d139SMark     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
983a587d139SMark     if (e) {
984a587d139SMark       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
985c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
986a587d139SMark     }
987a587d139SMark   }
988db78de30SJunchao Zhang 
989c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
990c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
991c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
992c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
993c17cf699SJunchao Zhang   */
994c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
995c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
996c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
997c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
998c17cf699SJunchao Zhang 
999c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
1000c17cf699SJunchao Zhang     KokkosBlas::axpy(alpha,Xa,Ya);
1001c17cf699SJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(Y);
1002c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1003c17cf699SJunchao Zhang     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
1004c17cf699SJunchao Zhang     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
1005c17cf699SJunchao Zhang 
1006c17cf699SJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
1007c17cf699SJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
1008c17cf699SJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
1009c17cf699SJunchao Zhang         PetscInt p,q = Yi(i);
1010c17cf699SJunchao Zhang         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1011c17cf699SJunchao Zhang           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1012c17cf699SJunchao Zhang           if (Xj(p) == Yj(q)) { /* Found it */
1013c17cf699SJunchao Zhang             Ya(q) += alpha * Xa(p);
1014c17cf699SJunchao Zhang             q++;
1015a587d139SMark           } else {
1016c17cf699SJunchao Zhang             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1017c17cf699SJunchao Zhang                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1018c17cf699SJunchao Zhang             */
1019c17cf699SJunchao Zhang             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1020a587d139SMark           }
1021c17cf699SJunchao Zhang         }
1022c17cf699SJunchao Zhang       });
1023c17cf699SJunchao Zhang     });
1024c17cf699SJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(Y);
1025c17cf699SJunchao Zhang   } else { /* different nonzero patterns */
1026c17cf699SJunchao Zhang     Mat             Z;
1027c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
1028c17cf699SJunchao Zhang     KernelHandle    kh;
1029c17cf699SJunchao Zhang     kh.create_spadd_handle(false);
1030c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1031c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1032c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
1033c17cf699SJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1034c17cf699SJunchao Zhang     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1035c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1036c17cf699SJunchao Zhang   }
1037eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1038eeadb341SJunchao Zhang   ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */
1039a587d139SMark   PetscFunctionReturn(0);
1040a587d139SMark }
1041a587d139SMark 
1042394ed5ebSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
104342550becSJunchao Zhang {
104442550becSJunchao Zhang   PetscErrorCode            ierr;
104542550becSJunchao Zhang   Mat_SeqAIJKokkos          *akok;
104642550becSJunchao Zhang   Mat_SeqAIJ                *aseq;
104742550becSJunchao Zhang 
104842550becSJunchao Zhang   PetscFunctionBegin;
1049*cbc6b225SStefano Zampini   ierr = MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
1050394ed5ebSJunchao Zhang   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
105142550becSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1052*cbc6b225SStefano Zampini   delete akok;
1053*cbc6b225SStefano Zampini   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE);
1054*cbc6b225SStefano Zampini   ierr = MatZeroEntries_SeqAIJKokkos(mat);CHKERRQ(ierr);
1055394ed5ebSJunchao Zhang   akok->SetUpCOO(aseq);
105642550becSJunchao Zhang   PetscFunctionReturn(0);
105742550becSJunchao Zhang }
105842550becSJunchao Zhang 
105942550becSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
106042550becSJunchao Zhang {
106142550becSJunchao Zhang   PetscErrorCode              ierr;
106242550becSJunchao Zhang   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
106342550becSJunchao Zhang   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1064394ed5ebSJunchao Zhang   PetscCount                  Annz = aseq->nz;
1065394ed5ebSJunchao Zhang   const PetscCountKokkosView& jmap = akok->jmap_d;
1066394ed5ebSJunchao Zhang   const PetscCountKokkosView& perm = akok->perm_d;
106742550becSJunchao Zhang   MatScalarKokkosView         Aa;
106842550becSJunchao Zhang   ConstMatScalarKokkosView    kv;
106942550becSJunchao Zhang   PetscMemType                memtype;
107042550becSJunchao Zhang 
107142550becSJunchao Zhang   PetscFunctionBegin;
107242550becSJunchao Zhang   ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
107342550becSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1074394ed5ebSJunchao Zhang     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
107542550becSJunchao Zhang   } else {
1076394ed5ebSJunchao Zhang     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
107742550becSJunchao Zhang   }
107842550becSJunchao Zhang 
1079*cbc6b225SStefano Zampini   if (imode == INSERT_VALUES) {
1080*cbc6b225SStefano Zampini     ierr = MatSeqAIJGetKokkosViewWrite(A,&Aa);CHKERRQ(ierr); /* write matrix values */
1081*cbc6b225SStefano Zampini     Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
1082*cbc6b225SStefano Zampini   } else {ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr);} /* read & write matrix values */
108342550becSJunchao Zhang 
1084*cbc6b225SStefano Zampini   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));});
1085394ed5ebSJunchao Zhang 
1086394ed5ebSJunchao Zhang   if (imode == INSERT_VALUES) {ierr = MatSeqAIJRestoreKokkosViewWrite(A,&Aa);CHKERRQ(ierr);}
1087394ed5ebSJunchao Zhang   else {ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);}
108842550becSJunchao Zhang   PetscFunctionReturn(0);
108942550becSJunchao Zhang }
109042550becSJunchao Zhang 
109186a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
10928f7e8f9dSMark Adams {
10938f7e8f9dSMark Adams   PetscErrorCode ierr;
10948f7e8f9dSMark Adams 
10958f7e8f9dSMark Adams   PetscFunctionBegin;
10968f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
10978f7e8f9dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
10988f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
10998f7e8f9dSMark Adams   PetscFunctionReturn(0);
11008f7e8f9dSMark Adams }
11018f7e8f9dSMark Adams 
11028c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
11038c3ff71bSJunchao Zhang {
110442550becSJunchao Zhang   PetscErrorCode     ierr;
1105076ba34aSJunchao Zhang   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1106076ba34aSJunchao Zhang 
11078c3ff71bSJunchao Zhang   PetscFunctionBegin;
1108076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
11096f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
11106f3d89d0SStefano Zampini 
11118c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
11128c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
11138c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1114a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1115f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1116a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1117076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
11188c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
11198c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
11208c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
11218c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
11228c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
11238c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1124076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
11250ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1126152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1127076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1128076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1129076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1130076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1131076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1132076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
113342550becSJunchao Zhang 
113442550becSJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr);
113542550becSJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr);
1136076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1137076ba34aSJunchao Zhang }
1138076ba34aSJunchao Zhang 
1139076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1140076ba34aSJunchao Zhang {
1141076ba34aSJunchao Zhang   PetscErrorCode ierr;
1142076ba34aSJunchao Zhang   Mat_SeqAIJ     *aseq;
1143076ba34aSJunchao Zhang   PetscInt       i,m,n;
1144076ba34aSJunchao Zhang 
1145076ba34aSJunchao Zhang   PetscFunctionBegin;
11462c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1147076ba34aSJunchao Zhang 
1148076ba34aSJunchao Zhang   m    = akok->nrows();
1149076ba34aSJunchao Zhang   n    = akok->ncols();
1150076ba34aSJunchao Zhang   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1151076ba34aSJunchao Zhang   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1152076ba34aSJunchao Zhang 
1153076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
1154076ba34aSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1155076ba34aSJunchao Zhang   aseq = (Mat_SeqAIJ*)(A)->data;
1156076ba34aSJunchao Zhang 
1157076ba34aSJunchao Zhang   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1158076ba34aSJunchao Zhang   akok->j_dual.sync_host();
1159076ba34aSJunchao Zhang 
1160076ba34aSJunchao Zhang   aseq->i            = akok->i_host_data();
1161076ba34aSJunchao Zhang   aseq->j            = akok->j_host_data();
1162076ba34aSJunchao Zhang   aseq->a            = akok->a_host_data();
1163076ba34aSJunchao Zhang   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1164076ba34aSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE;
1165076ba34aSJunchao Zhang   aseq->free_a       = PETSC_FALSE;
1166076ba34aSJunchao Zhang   aseq->free_ij      = PETSC_FALSE;
1167076ba34aSJunchao Zhang   aseq->nz           = akok->nnz();
1168076ba34aSJunchao Zhang   aseq->maxnz        = aseq->nz;
1169076ba34aSJunchao Zhang 
1170076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1171076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1172076ba34aSJunchao Zhang   for (i=0; i<m; i++) {
1173076ba34aSJunchao Zhang     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1174076ba34aSJunchao Zhang   }
1175076ba34aSJunchao Zhang 
1176076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1177076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1178ff751488SJunchao Zhang   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1179076ba34aSJunchao Zhang   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1180076ba34aSJunchao Zhang   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1181076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1182076ba34aSJunchao Zhang }
1183076ba34aSJunchao Zhang 
1184076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1185076ba34aSJunchao Zhang 
1186076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1187076ba34aSJunchao Zhang  */
1188076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1189076ba34aSJunchao Zhang {
1190076ba34aSJunchao Zhang   PetscErrorCode ierr;
1191076ba34aSJunchao Zhang 
1192076ba34aSJunchao Zhang   PetscFunctionBegin;
1193076ba34aSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1194076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
11958c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
11968c3ff71bSJunchao Zhang }
11978c3ff71bSJunchao Zhang 
11988c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
1199152b3e56SJunchao Zhang /*@C
12008c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
12018c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
12028c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
12038c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
12048c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
12058c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
12068c3ff71bSJunchao Zhang 
12078c3ff71bSJunchao Zhang    Collective
12088c3ff71bSJunchao Zhang 
12098c3ff71bSJunchao Zhang    Input Parameters:
12108c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
12118c3ff71bSJunchao Zhang .  m - number of rows
12128c3ff71bSJunchao Zhang .  n - number of columns
12138c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
12148c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
12158c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
12168c3ff71bSJunchao Zhang 
12178c3ff71bSJunchao Zhang    Output Parameter:
12188c3ff71bSJunchao Zhang .  A - the matrix
12198c3ff71bSJunchao Zhang 
12208c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
12218c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
12228c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
12238c3ff71bSJunchao Zhang 
12248c3ff71bSJunchao Zhang    Notes:
12258c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
12268c3ff71bSJunchao Zhang 
12278c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
12288c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
12298c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
12308c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
12318c3ff71bSJunchao Zhang 
12328c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
12338c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
12348c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
12358c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
12368c3ff71bSJunchao Zhang 
12378c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
12388c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
12398c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
12408c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
12418c3ff71bSJunchao Zhang 
12428c3ff71bSJunchao Zhang    Level: intermediate
12438c3ff71bSJunchao Zhang 
1244076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
12458c3ff71bSJunchao Zhang @*/
12468c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
12478c3ff71bSJunchao Zhang {
12488c3ff71bSJunchao Zhang   PetscErrorCode ierr;
12498c3ff71bSJunchao Zhang 
12508c3ff71bSJunchao Zhang   PetscFunctionBegin;
12518c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
12528c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
12538c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
12548c3ff71bSJunchao Zhang   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
12558c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
12568c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
12578c3ff71bSJunchao Zhang }
1258930e68a5SMark Adams 
12598f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
12608f7e8f9dSMark Adams //
126146804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used).
12628f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
12638f7e8f9dSMark Adams //
12648f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1265930e68a5SMark Adams {
12668f7e8f9dSMark Adams   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
12678f7e8f9dSMark Adams   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
12688f7e8f9dSMark Adams   IS                 isrow = b->row,isicol = b->icol;
1269930e68a5SMark Adams   PetscErrorCode     ierr;
12708f7e8f9dSMark Adams   const PetscInt     *r_h,*ic_h;
1271300d22a6SJunchao Zhang   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1272076ba34aSJunchao Zhang   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1273076ba34aSJunchao Zhang   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
12748f7e8f9dSMark Adams   PetscBool          row_identity,col_identity;
127546804e07SMark Adams   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1276930e68a5SMark Adams 
1277930e68a5SMark Adams   PetscFunctionBegin;
12782c71b3e2SJacob Faibussowitsch   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
12798f7e8f9dSMark Adams   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
12802c71b3e2SJacob Faibussowitsch   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
12818f7e8f9dSMark Adams   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
12828f7e8f9dSMark Adams   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
12838f7e8f9dSMark Adams   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
12848f7e8f9dSMark Adams   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
12858f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
12868f7e8f9dSMark Adams   {
12878f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
12888f7e8f9dSMark Adams     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
12898f7e8f9dSMark Adams     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
12908f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
12918f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
12928f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
12938f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
12948f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
12958f7e8f9dSMark Adams     size_t flops_h = 0.0;
12968f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
12978f7e8f9dSMark Adams     Kokkos::View<size_t> d_flops_k ("flops");
12988f7e8f9dSMark Adams     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
12998f7e8f9dSMark Adams     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
13008f7e8f9dSMark Adams     Kokkos::deep_copy (d_flops_k, h_flops_k);
13018f7e8f9dSMark Adams     Kokkos::deep_copy (d_r_k, h_r_k);
13028f7e8f9dSMark Adams     Kokkos::deep_copy (d_ic_k, h_ic_k);
13038f7e8f9dSMark Adams     // Fill A --> fact
13048f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1305042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
13068f7e8f9dSMark Adams         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
13078f7e8f9dSMark Adams         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
13088f7e8f9dSMark Adams         // zero rows of B
13098f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
13108f7e8f9dSMark Adams             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
13118f7e8f9dSMark Adams             PetscScalar *baL = ba_d + bi_d[rowb];
13128f7e8f9dSMark Adams             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
13138f7e8f9dSMark Adams             /* zero (unfactored row) */
13148f7e8f9dSMark Adams             for (int j=0;j<nzbL;j++) baL[j] = 0;
13158f7e8f9dSMark Adams             for (int j=0;j<nzbU;j++) baU[j] = 0;
13168f7e8f9dSMark Adams           });
13178f7e8f9dSMark Adams         // copy A into B
13188f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
13198f7e8f9dSMark Adams             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
13208f7e8f9dSMark Adams             const PetscScalar *av    = aa_d + ai_d[rowa];
13218f7e8f9dSMark Adams             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
13228f7e8f9dSMark Adams             /* load in initial (unfactored row) */
13238f7e8f9dSMark Adams             for (int j=0;j<nza;j++) {
13248f7e8f9dSMark Adams               PetscInt    colb = ic[ajtmp[j]];
13258f7e8f9dSMark Adams               PetscScalar vala = av[j];
13268f7e8f9dSMark Adams               if (colb == rowb) {
13278f7e8f9dSMark Adams                 *(ba_d + bdiag_d[rowb]) = vala;
13288f7e8f9dSMark Adams               } else {
13298f7e8f9dSMark Adams                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
13308f7e8f9dSMark Adams                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
13318f7e8f9dSMark Adams                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
13328f7e8f9dSMark Adams                 for (int j=0; j<nz ; j++) {
13338f7e8f9dSMark Adams                   if (pbj[j] == colb) {
13348f7e8f9dSMark Adams                     pba[j] = vala;
13358f7e8f9dSMark Adams                     set++;
13368f7e8f9dSMark Adams                     break;
13378f7e8f9dSMark Adams                   }
13388f7e8f9dSMark Adams                 }
13398f1da0b2SJunchao Zhang                #if !defined(PETSC_HAVE_SYCL)
13408f7e8f9dSMark Adams                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
13418f1da0b2SJunchao Zhang                #endif
13428f7e8f9dSMark Adams               }
13438f7e8f9dSMark Adams             }
13448f7e8f9dSMark Adams           });
13458f7e8f9dSMark Adams       });
13468f7e8f9dSMark Adams     Kokkos::fence();
1347930e68a5SMark Adams 
13488f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
13498f7e8f9dSMark Adams         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
13508f7e8f9dSMark Adams         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
13518f7e8f9dSMark Adams         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1352042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
13538f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc;
13548f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
13558f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
13568f7e8f9dSMark Adams         for (int ii=start; ii<end-1; ii++) {
13578f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
13588f7e8f9dSMark Adams           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
13598f7e8f9dSMark Adams           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
13608f7e8f9dSMark Adams           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
13618f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
13628f7e8f9dSMark Adams               PetscInt kIdx = j*Ni + field_block_idx;
13638f7e8f9dSMark Adams               if (kIdx >= nzUi) /* void */ ;
13648f7e8f9dSMark Adams               else {
13658f7e8f9dSMark Adams                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
13668f7e8f9dSMark Adams                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
13678f7e8f9dSMark Adams                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
13688f7e8f9dSMark Adams                 size_t         st_idx;
13698f7e8f9dSMark Adams                 // find and do L(k,i) = A(:k,i) / A(i,i)
13708f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
13718f7e8f9dSMark Adams                 // get column, there has got to be a better way
13728f7e8f9dSMark Adams                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
13738f7e8f9dSMark Adams                     if (pjL[j] == ii) {
13748f7e8f9dSMark Adams                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
13758f7e8f9dSMark Adams                       idx = j; // output
13768f7e8f9dSMark Adams                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
13778f7e8f9dSMark Adams                     }
13788f7e8f9dSMark Adams                 }, st_idx);
13798f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
13808f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
138199551766SMark Adams                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
138299551766SMark Adams #endif
138399551766SMark Adams                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
13848f7e8f9dSMark Adams                 // U(i+1,:end)
13858f7e8f9dSMark Adams                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
13868f7e8f9dSMark Adams                       PetscScalar Uij = baUi[uiIdx];
13878f7e8f9dSMark Adams                       PetscInt    col = bjUi[uiIdx];
13888f7e8f9dSMark Adams                       if (col==myk) {
13898f7e8f9dSMark Adams                         // A_kk = A_kk - L_ki * U_ij(k)
13908f7e8f9dSMark Adams                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
13918f7e8f9dSMark Adams                         *Akkv = *Akkv - L_ki() * Uij; // UiK
13928f7e8f9dSMark Adams                       } else {
13938f7e8f9dSMark Adams                         PetscScalar    *start, *end, *pAkjv=NULL;
13948f7e8f9dSMark Adams                         PetscInt       high, low;
13958f7e8f9dSMark Adams                         const PetscInt *startj;
13968f7e8f9dSMark Adams                         if (col<myk) { // L
13978f7e8f9dSMark Adams                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
13988f7e8f9dSMark Adams                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
13998f7e8f9dSMark Adams                           start = pLki+1; // start at pLki+1, A22(myk,1)
14008f7e8f9dSMark Adams                           startj= bj_d + bi_d[myk] + idx;
14018f7e8f9dSMark Adams                           end   = ba_d + bi_d[myk+1];
14028f7e8f9dSMark Adams                         } else {
14038f7e8f9dSMark Adams                           PetscInt idx = bdiag_d[myk+1]+1;
14048f7e8f9dSMark Adams                           start = ba_d + idx;
14058f7e8f9dSMark Adams                           startj= bj_d + idx;
14068f7e8f9dSMark Adams                           end   = ba_d + bdiag_d[myk];
14078f7e8f9dSMark Adams                         }
14088f7e8f9dSMark Adams                         // search for 'col', use bisection search - TODO
14098f7e8f9dSMark Adams                         low  = 0;
14108f7e8f9dSMark Adams                         high = (PetscInt)(end-start);
14118f7e8f9dSMark Adams                         while (high-low > 5) {
14128f7e8f9dSMark Adams                           int t = (low+high)/2;
14138f7e8f9dSMark Adams                           if (startj[t] > col) high = t;
14148f7e8f9dSMark Adams                           else                 low  = t;
14158f7e8f9dSMark Adams                         }
14168f7e8f9dSMark Adams                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
14178f7e8f9dSMark Adams                           if (startj[pAkjv-start] == col) break;
14188f7e8f9dSMark Adams                         }
14198f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
142099551766SMark Adams                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
142199551766SMark Adams #endif
14228f7e8f9dSMark Adams                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
14238f7e8f9dSMark Adams                       }
14248f7e8f9dSMark Adams                     });
14258f7e8f9dSMark Adams               }
14268f7e8f9dSMark Adams             });
14278f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
14288f7e8f9dSMark Adams           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
14298f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
14308f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
14318f7e8f9dSMark Adams       });
14328f7e8f9dSMark Adams     Kokkos::fence();
14338f7e8f9dSMark Adams     Kokkos::deep_copy (h_flops_k, d_flops_k);
14348f7e8f9dSMark Adams     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
14358f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
14368f7e8f9dSMark Adams         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
14378f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
14388f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
14398f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
14408f7e8f9dSMark Adams             int i = start + outer_index*Ni + lg_rank%Ni;
14418f7e8f9dSMark Adams             if (i < end) {
14428f7e8f9dSMark Adams               PetscScalar *pv = ba_d + bdiag_d[i];
14438f7e8f9dSMark Adams               *pv = 1.0/(*pv);
14448f7e8f9dSMark Adams             }
14458f7e8f9dSMark Adams           });
14468f7e8f9dSMark Adams       });
14478f7e8f9dSMark Adams   }
14488f7e8f9dSMark Adams   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
14498f7e8f9dSMark Adams   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
14508f7e8f9dSMark Adams   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
14518f7e8f9dSMark Adams 
14528f7e8f9dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
14538f7e8f9dSMark Adams   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
14548f7e8f9dSMark Adams   if (b->inode.size) {
14558f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
14568f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
14578f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
14588f7e8f9dSMark Adams   } else {
14598f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
14608f7e8f9dSMark Adams   }
14618f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
14628f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
14638f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
14648f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
14658f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
14668f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
14678f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
14688f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
14698f7e8f9dSMark Adams 
1470930e68a5SMark Adams   PetscFunctionReturn(0);
1471930e68a5SMark Adams }
1472930e68a5SMark Adams 
147386a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1474930e68a5SMark Adams {
1475930e68a5SMark Adams   PetscErrorCode   ierr;
1476930e68a5SMark Adams 
1477930e68a5SMark Adams   PetscFunctionBegin;
1478930e68a5SMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
147986a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
148086a27549SJunchao Zhang   PetscFunctionReturn(0);
148186a27549SJunchao Zhang }
148286a27549SJunchao Zhang 
148386a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
148486a27549SJunchao Zhang {
148586a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
148686a27549SJunchao Zhang 
148786a27549SJunchao Zhang   PetscFunctionBegin;
148886a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
148986a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
149086a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
149186a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
149286a27549SJunchao Zhang   }
149386a27549SJunchao Zhang   PetscFunctionReturn(0);
149486a27549SJunchao Zhang }
149586a27549SJunchao Zhang 
149686a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
149786a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
149886a27549SJunchao Zhang {
149986a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1500076ba34aSJunchao Zhang   MatColIdxType             n = A->rmap->n;
150186a27549SJunchao Zhang 
150286a27549SJunchao Zhang   PetscFunctionBegin;
150386a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
150486a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
1505076ba34aSJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
150686a27549SJunchao Zhang     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1507076ba34aSJunchao Zhang     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1508076ba34aSJunchao Zhang     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
150986a27549SJunchao Zhang 
151086a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1511076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1512076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1513076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
151486a27549SJunchao Zhang         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
151586a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
151686a27549SJunchao Zhang 
151786a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
151886a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
151986a27549SJunchao Zhang     */
1520076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1521076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
152286a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
152386a27549SJunchao Zhang 
152486a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
152586a27549SJunchao Zhang 
152686a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
1527076ba34aSJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
152886a27549SJunchao Zhang     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1529076ba34aSJunchao Zhang     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1530076ba34aSJunchao Zhang     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
153186a27549SJunchao Zhang 
153286a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1533076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1534076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1535076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
153686a27549SJunchao Zhang         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
153786a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
153886a27549SJunchao Zhang 
153986a27549SJunchao Zhang     /* Sort indices. See comments above */
1540076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1541076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
154286a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
154386a27549SJunchao Zhang 
154486a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
154586a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
154686a27549SJunchao Zhang   }
154786a27549SJunchao Zhang   PetscFunctionReturn(0);
154886a27549SJunchao Zhang }
154986a27549SJunchao Zhang 
155086a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
155186a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
155286a27549SJunchao Zhang {
155386a27549SJunchao Zhang   PetscErrorCode                 ierr;
155486a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
155586a27549SJunchao Zhang   PetscScalarKokkosView          xv;
155686a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
155786a27549SJunchao Zhang 
155886a27549SJunchao Zhang   PetscFunctionBegin;
1559eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
156086a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
156186a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1562ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr);
156386a27549SJunchao Zhang   /* Solve L tmpv = b */
15643f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
156586a27549SJunchao Zhang   /* Solve Ux = tmpv */
15663f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
156786a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1568ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr);
1569eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
157086a27549SJunchao Zhang   PetscFunctionReturn(0);
157186a27549SJunchao Zhang }
157286a27549SJunchao Zhang 
1573076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
157486a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
157586a27549SJunchao Zhang {
157686a27549SJunchao Zhang   PetscErrorCode                 ierr;
157786a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
157886a27549SJunchao Zhang   PetscScalarKokkosView          xv;
157986a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
158086a27549SJunchao Zhang 
158186a27549SJunchao Zhang   PetscFunctionBegin;
1582eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
158386a27549SJunchao Zhang   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
158486a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1585ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr);
158686a27549SJunchao Zhang   /* Solve U^T tmpv = b */
158786a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
158886a27549SJunchao Zhang 
158986a27549SJunchao Zhang   /* Solve L^T x = tmpv */
159086a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
159186a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1592ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr);
1593eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
159486a27549SJunchao Zhang   PetscFunctionReturn(0);
159586a27549SJunchao Zhang }
159686a27549SJunchao Zhang 
159786a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
159886a27549SJunchao Zhang {
159986a27549SJunchao Zhang   PetscErrorCode                 ierr;
160086a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
160186a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
160286a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
160386a27549SJunchao Zhang 
160486a27549SJunchao Zhang   PetscFunctionBegin;
1605eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
160686a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1607076ba34aSJunchao Zhang 
1608076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1609076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1610076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1611076ba34aSJunchao Zhang 
1612076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
161386a27549SJunchao Zhang 
161486a27549SJunchao Zhang   B->assembled                       = PETSC_TRUE;
161586a27549SJunchao Zhang   B->preallocated                    = PETSC_TRUE;
161686a27549SJunchao Zhang   B->ops->solve                      = MatSolve_SeqAIJKokkos;
161786a27549SJunchao Zhang   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
161886a27549SJunchao Zhang   B->ops->matsolve                   = NULL;
161986a27549SJunchao Zhang   B->ops->matsolvetranspose          = NULL;
162086a27549SJunchao Zhang   B->offloadmask                     = PETSC_OFFLOAD_GPU;
162186a27549SJunchao Zhang 
162286a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
162386a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
162486a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1625eeadb341SJunchao Zhang   /* TODO: log flops, but how to know that? */
1626eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
162786a27549SJunchao Zhang   PetscFunctionReturn(0);
162886a27549SJunchao Zhang }
162986a27549SJunchao Zhang 
163086a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
163186a27549SJunchao Zhang {
163286a27549SJunchao Zhang   PetscErrorCode                 ierr;
163386a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok;
163486a27549SJunchao Zhang   Mat_SeqAIJ                     *b;
163586a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
163686a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
163786a27549SJunchao Zhang   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
163886a27549SJunchao Zhang   PetscInt                       n = A->rmap->n;
163986a27549SJunchao Zhang 
164086a27549SJunchao Zhang   PetscFunctionBegin;
164186a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
164286a27549SJunchao Zhang   /* Rebuild factors */
164386a27549SJunchao Zhang   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
164486a27549SJunchao Zhang   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
164586a27549SJunchao Zhang 
164686a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
164786a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
164886a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
164986a27549SJunchao Zhang 
165086a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
165186a27549SJunchao Zhang 
165286a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
165386a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
165486a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d,n+1);
165586a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
165686a27549SJunchao Zhang 
165786a27549SJunchao Zhang   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1658076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1659076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1660076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
166186a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
166286a27549SJunchao Zhang 
166386a27549SJunchao Zhang   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
166486a27549SJunchao Zhang   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
166586a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
166686a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
166786a27549SJunchao Zhang 
166886a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
166986a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
167086a27549SJunchao Zhang  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
167186a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
167286a27549SJunchao Zhang  #else
167386a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
167486a27549SJunchao Zhang  #endif
167586a27549SJunchao Zhang 
167686a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
167786a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
167886a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
167986a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
168086a27549SJunchao Zhang 
168186a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
168286a27549SJunchao Zhang   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
168386a27549SJunchao Zhang   b     = (Mat_SeqAIJ*)B->data;
168486a27549SJunchao Zhang   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
168586a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
168686a27549SJunchao Zhang   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
168786a27549SJunchao Zhang 
168886a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
168986a27549SJunchao Zhang   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1690930e68a5SMark Adams   PetscFunctionReturn(0);
1691930e68a5SMark Adams }
1692930e68a5SMark Adams 
16938f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
16948f7e8f9dSMark Adams {
16958f7e8f9dSMark Adams   PetscErrorCode   ierr;
16968f7e8f9dSMark Adams   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
16978f7e8f9dSMark Adams   const PetscInt   nrows   = A->rmap->n;
1698930e68a5SMark Adams 
16998f7e8f9dSMark Adams   PetscFunctionBegin;
17008f7e8f9dSMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
17018f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
17028f7e8f9dSMark Adams   // move B data into Kokkos
17038f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
17048f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
17058f7e8f9dSMark Adams   {
17068f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1707300d22a6SJunchao Zhang     if (!baijkok->diag_d.extent(0)) {
17088f7e8f9dSMark Adams       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1709300d22a6SJunchao Zhang       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1710300d22a6SJunchao Zhang       Kokkos::deep_copy (baijkok->diag_d, h_diag);
17118f7e8f9dSMark Adams     }
17128f7e8f9dSMark Adams   }
17138f7e8f9dSMark Adams   PetscFunctionReturn(0);
17148f7e8f9dSMark Adams }
17158f7e8f9dSMark Adams 
171686a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1717930e68a5SMark Adams {
1718930e68a5SMark Adams   PetscFunctionBegin;
1719930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
1720930e68a5SMark Adams   PetscFunctionReturn(0);
1721930e68a5SMark Adams }
1722930e68a5SMark Adams 
17238f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
17248f7e8f9dSMark Adams {
17258f7e8f9dSMark Adams   PetscFunctionBegin;
17268f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
17278f7e8f9dSMark Adams   PetscFunctionReturn(0);
17288f7e8f9dSMark Adams }
17298f7e8f9dSMark Adams 
1730930e68a5SMark Adams /*MC
173186a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
173286a27549SJunchao Zhang   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1733930e68a5SMark Adams 
1734930e68a5SMark Adams   Level: beginner
1735930e68a5SMark Adams 
17363f3ba80aSJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1737930e68a5SMark Adams M*/
173886a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1739930e68a5SMark Adams {
1740930e68a5SMark Adams   PetscErrorCode ierr;
1741930e68a5SMark Adams   PetscInt       n = A->rmap->n;
1742930e68a5SMark Adams 
1743930e68a5SMark Adams   PetscFunctionBegin;
1744930e68a5SMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1745930e68a5SMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1746930e68a5SMark Adams   (*B)->factortype = ftype;
17474ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1748930e68a5SMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1749930e68a5SMark Adams 
17508f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
1751930e68a5SMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
175286a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_TRUE;
175386a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
175486a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
175586a27549SJunchao Zhang     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
175686a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
175786a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
175898921bdaSJacob Faibussowitsch   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1759930e68a5SMark Adams 
1760930e68a5SMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
176186a27549SJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1762930e68a5SMark Adams   PetscFunctionReturn(0);
1763930e68a5SMark Adams }
17648f7e8f9dSMark Adams 
17658f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
17668f7e8f9dSMark Adams {
17678f7e8f9dSMark Adams   PetscErrorCode ierr;
17688f7e8f9dSMark Adams   PetscInt       n = A->rmap->n;
17698f7e8f9dSMark Adams 
17708f7e8f9dSMark Adams   PetscFunctionBegin;
17718f7e8f9dSMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
17728f7e8f9dSMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
17738f7e8f9dSMark Adams   (*B)->factortype = ftype;
1774f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
17754ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
17768f7e8f9dSMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
17778f7e8f9dSMark Adams 
17788f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
17798f7e8f9dSMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
17808f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
17818f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
17828f7e8f9dSMark Adams 
17838f7e8f9dSMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
17848f7e8f9dSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
17858f7e8f9dSMark Adams   PetscFunctionReturn(0);
17868f7e8f9dSMark Adams }
178786a27549SJunchao Zhang 
178886a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
178986a27549SJunchao Zhang {
179086a27549SJunchao Zhang   PetscErrorCode ierr;
179186a27549SJunchao Zhang 
179286a27549SJunchao Zhang   PetscFunctionBegin;
179386a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
179486a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
179586a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
179686a27549SJunchao Zhang   PetscFunctionReturn(0);
179786a27549SJunchao Zhang }
179886a27549SJunchao Zhang 
1799076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
1800076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1801076ba34aSJunchao Zhang {
1802076ba34aSJunchao Zhang   PetscErrorCode    ierr;
1803076ba34aSJunchao Zhang   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1804076ba34aSJunchao Zhang   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1805076ba34aSJunchao Zhang   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1806076ba34aSJunchao Zhang   const PetscInt    *i = iv.data();
1807076ba34aSJunchao Zhang   const PetscInt    *j = jv.data();
1808076ba34aSJunchao Zhang   const PetscScalar *a = av.data();
1809076ba34aSJunchao Zhang   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1810076ba34aSJunchao Zhang 
1811076ba34aSJunchao Zhang   PetscFunctionBegin;
1812c0aa6a63SJacob Faibussowitsch   ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1813076ba34aSJunchao Zhang   for (PetscInt k=0; k<m; k++) {
1814c0aa6a63SJacob Faibussowitsch     ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr);
1815076ba34aSJunchao Zhang     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1816c0aa6a63SJacob Faibussowitsch       ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1817076ba34aSJunchao Zhang     }
1818076ba34aSJunchao Zhang     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1819076ba34aSJunchao Zhang   }
1820076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1821076ba34aSJunchao Zhang }
1822