xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 7fb60732ccf1806988e9909cd147cdb09c395756)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2076ba34aSJunchao Zhang #include <petscpkg_version.h>
3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
442550becSJunchao Zhang #include <petsc/private/sfimpl.h>
58c3ff71bSJunchao Zhang #include <petscsystypes.h>
68c3ff71bSJunchao Zhang #include <petscerror.h>
78c3ff71bSJunchao Zhang 
88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
9f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
10076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp>
118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
128c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
15076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
16076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
1786a27549SJunchao Zhang 
1842550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
198c3ff71bSJunchao Zhang 
208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
218c3ff71bSJunchao Zhang 
22076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
25076ba34aSJunchao Zhang  */
268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
278c3ff71bSJunchao Zhang {
28076ba34aSJunchao Zhang   Mat_SeqAIJ       *aijseq;
29076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
308c3ff71bSJunchao Zhang 
318c3ff71bSJunchao Zhang   PetscFunctionBegin;
32076ba34aSJunchao Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
339566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
34076ba34aSJunchao Zhang 
35076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
36076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
37076ba34aSJunchao Zhang 
38076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
39076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
40076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
41076ba34aSJunchao Zhang   */
42076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
43076ba34aSJunchao Zhang     delete aijkok;
44076ba34aSJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
45076ba34aSJunchao Zhang     A->spptr = aijkok;
46076ba34aSJunchao Zhang   }
47076ba34aSJunchao Zhang 
485519a089SJose E. Roman   if (aijkok->device_mat_d.data()) {
49a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
50a587d139SMark   }
518c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
528c3ff71bSJunchao Zhang }
538c3ff71bSJunchao Zhang 
5486a27549SJunchao Zhang /* Sync CSR data to device if not yet */
55076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
568c3ff71bSJunchao Zhang {
578c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
588c3ff71bSJunchao Zhang 
598c3ff71bSJunchao Zhang   PetscFunctionBegin;
605f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
615f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
62076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
63076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
64580c7c76SPierre Jolivet     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
6586a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
668c3ff71bSJunchao Zhang   }
678c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
688c3ff71bSJunchao Zhang }
698c3ff71bSJunchao Zhang 
70076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
71fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
7286a27549SJunchao Zhang {
7386a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
7486a27549SJunchao Zhang 
7586a27549SJunchao Zhang   PetscFunctionBegin;
765f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
7786a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
7886a27549SJunchao Zhang   aijkok->a_dual.modify_device();
7986a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
8086a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
819566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
829566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
8386a27549SJunchao Zhang   PetscFunctionReturn(0);
8486a27549SJunchao Zhang }
8586a27549SJunchao Zhang 
86f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
87f0cf5187SStefano Zampini {
88f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
89f0cf5187SStefano Zampini 
90f0cf5187SStefano Zampini   PetscFunctionBegin;
91f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9286a27549SJunchao Zhang   /* We do not expect one needs factors on host  */
935f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
945f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
95076ba34aSJunchao Zhang   aijkok->a_dual.sync_host();
96f0cf5187SStefano Zampini   PetscFunctionReturn(0);
97f0cf5187SStefano Zampini }
98f0cf5187SStefano Zampini 
99f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
100f0cf5187SStefano Zampini {
101076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
102f0cf5187SStefano Zampini 
103f0cf5187SStefano Zampini   PetscFunctionBegin;
1045519a089SJose E. Roman   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
1055519a089SJose E. Roman     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
1065519a089SJose E. Roman     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
1075519a089SJose E. Roman     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
1085519a089SJose E. Roman   */
1095519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
110076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
111076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
112076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
113076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
114076ba34aSJunchao Zhang   }
115076ba34aSJunchao Zhang   PetscFunctionReturn(0);
116076ba34aSJunchao Zhang }
117076ba34aSJunchao Zhang 
118076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
119076ba34aSJunchao Zhang {
120076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
121076ba34aSJunchao Zhang 
122076ba34aSJunchao Zhang   PetscFunctionBegin;
1235519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
124076ba34aSJunchao Zhang   PetscFunctionReturn(0);
125076ba34aSJunchao Zhang }
126076ba34aSJunchao Zhang 
127076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
128076ba34aSJunchao Zhang {
129076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
130076ba34aSJunchao Zhang 
131076ba34aSJunchao Zhang   PetscFunctionBegin;
1325519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
133076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
134076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1352328674fSJunchao Zhang   } else {
1362328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1372328674fSJunchao Zhang   }
138076ba34aSJunchao Zhang   PetscFunctionReturn(0);
139076ba34aSJunchao Zhang }
140076ba34aSJunchao Zhang 
141076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
142076ba34aSJunchao Zhang {
143076ba34aSJunchao Zhang   PetscFunctionBegin;
144076ba34aSJunchao Zhang   *array = NULL;
145076ba34aSJunchao Zhang   PetscFunctionReturn(0);
146076ba34aSJunchao Zhang }
147076ba34aSJunchao Zhang 
148076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
149076ba34aSJunchao Zhang {
150076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
151076ba34aSJunchao Zhang 
152076ba34aSJunchao Zhang   PetscFunctionBegin;
1535519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
154076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1552328674fSJunchao Zhang   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
1562328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1572328674fSJunchao Zhang   }
158076ba34aSJunchao Zhang   PetscFunctionReturn(0);
159076ba34aSJunchao Zhang }
160076ba34aSJunchao Zhang 
161076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
162076ba34aSJunchao Zhang {
163076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
164076ba34aSJunchao Zhang 
165076ba34aSJunchao Zhang   PetscFunctionBegin;
1665519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
167076ba34aSJunchao Zhang     aijkok->a_dual.clear_sync_state();
168076ba34aSJunchao Zhang     aijkok->a_dual.modify_host();
1692328674fSJunchao Zhang   }
170f0cf5187SStefano Zampini   PetscFunctionReturn(0);
171f0cf5187SStefano Zampini }
172f0cf5187SStefano Zampini 
1737ee59b9bSJunchao Zhang static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
1747ee59b9bSJunchao Zhang {
1757ee59b9bSJunchao Zhang   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1767ee59b9bSJunchao Zhang 
1777ee59b9bSJunchao Zhang   PetscFunctionBegin;
1787ee59b9bSJunchao Zhang   PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL");
1797ee59b9bSJunchao Zhang 
1807ee59b9bSJunchao Zhang   if (i) *i = aijkok->i_device_data();
1817ee59b9bSJunchao Zhang   if (j) *j = aijkok->j_device_data();
1827ee59b9bSJunchao Zhang   if (a) {
1837ee59b9bSJunchao Zhang     aijkok->a_dual.sync_device();
1847ee59b9bSJunchao Zhang     *a = aijkok->a_device_data();
1857ee59b9bSJunchao Zhang   }
1867ee59b9bSJunchao Zhang   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
1877ee59b9bSJunchao Zhang   PetscFunctionReturn(0);
1887ee59b9bSJunchao Zhang }
1897ee59b9bSJunchao Zhang 
190a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
191042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
192a587d139SMark {
193042217e8SBarry Smith   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
194042217e8SBarry Smith   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
195a587d139SMark 
196a587d139SMark   PetscFunctionBegin;
1975f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
198152b3e56SJunchao Zhang   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
199a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
200a587d139SMark   PetscFunctionReturn(0);
201a587d139SMark }
202a587d139SMark 
203a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
204042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
205a587d139SMark {
206042217e8SBarry Smith   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
207a587d139SMark 
208a587d139SMark   PetscFunctionBegin;
209a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
210a587d139SMark     *d_mat = aijkok->device_mat_d.data();
211a587d139SMark   } else {
2129566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
213a587d139SMark     *d_mat  = NULL;
214a587d139SMark   }
215a587d139SMark   PetscFunctionReturn(0);
216a587d139SMark }
217076ba34aSJunchao Zhang 
218076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */
219076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
220152b3e56SJunchao Zhang {
221152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
222152b3e56SJunchao Zhang 
223152b3e56SJunchao Zhang   PetscFunctionBegin;
2245f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
225076ba34aSJunchao Zhang   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
226076ba34aSJunchao Zhang     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
2279566063dSJacob Faibussowitsch     PetscCallCXX(aijkok->a_dual.sync_device());
2289566063dSJacob Faibussowitsch     PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
2299566063dSJacob Faibussowitsch     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
23086a27549SJunchao Zhang     aijkok->transpose_updated = PETSC_TRUE;
231076ba34aSJunchao Zhang   }
232076ba34aSJunchao Zhang   *csrmatT = &aijkok->csrmatT;
233152b3e56SJunchao Zhang   PetscFunctionReturn(0);
234152b3e56SJunchao Zhang }
235152b3e56SJunchao Zhang 
236076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */
237076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
238152b3e56SJunchao Zhang {
239152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
240152b3e56SJunchao Zhang 
241152b3e56SJunchao Zhang   PetscFunctionBegin;
2429566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2435f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
244076ba34aSJunchao Zhang   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
2459566063dSJacob Faibussowitsch     PetscCallCXX(aijkok->a_dual.sync_device());
2469566063dSJacob Faibussowitsch     PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
2479566063dSJacob Faibussowitsch     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
248076ba34aSJunchao Zhang    #if defined(PETSC_USE_COMPLEX)
249076ba34aSJunchao Zhang     const auto& a = aijkok->csrmatH.values;
250076ba34aSJunchao Zhang     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
251076ba34aSJunchao Zhang    #endif
25286a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_TRUE;
253076ba34aSJunchao Zhang   }
254076ba34aSJunchao Zhang   *csrmatH = &aijkok->csrmatH;
2559566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
256152b3e56SJunchao Zhang   PetscFunctionReturn(0);
257152b3e56SJunchao Zhang }
258a587d139SMark 
2598c3ff71bSJunchao Zhang /* y = A x */
2608c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2618c3ff71bSJunchao Zhang {
2628c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
263152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
264152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2658c3ff71bSJunchao Zhang 
2668c3ff71bSJunchao Zhang   PetscFunctionBegin;
2679566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2689566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2699566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx,&xv));
2709566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy,&yv));
2718c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
272152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
2739566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx,&xv));
2749566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
275076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
2769566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
2779566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
2788c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2798c3ff71bSJunchao Zhang }
2808c3ff71bSJunchao Zhang 
2818c3ff71bSJunchao Zhang /* y = A^T x */
2828c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2838c3ff71bSJunchao Zhang {
2848c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
285152b3e56SJunchao Zhang   const char                       *mode;
286152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
287152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
288076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
2898c3ff71bSJunchao Zhang 
2908c3ff71bSJunchao Zhang   PetscFunctionBegin;
2919566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2929566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2939566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx,&xv));
2949566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy,&yv));
295152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
2969566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
297152b3e56SJunchao Zhang     mode = "N";
298152b3e56SJunchao Zhang   } else {
299076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
300076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
301152b3e56SJunchao Zhang     mode = "T";
302152b3e56SJunchao Zhang   }
303076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
3049566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx,&xv));
3059566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
3069566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
3079566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3088c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3098c3ff71bSJunchao Zhang }
3108c3ff71bSJunchao Zhang 
3118c3ff71bSJunchao Zhang /* y = A^H x */
3128c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
3138c3ff71bSJunchao Zhang {
3148c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
315152b3e56SJunchao Zhang   const char                       *mode;
316152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
317152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
318076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3198c3ff71bSJunchao Zhang 
3208c3ff71bSJunchao Zhang   PetscFunctionBegin;
3219566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3229566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3239566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx,&xv));
3249566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy,&yv));
325152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
3269566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
327152b3e56SJunchao Zhang     mode = "N";
328152b3e56SJunchao Zhang   } else {
329076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
330076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
331152b3e56SJunchao Zhang     mode = "C";
332152b3e56SJunchao Zhang   }
333076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
3349566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx,&xv));
3359566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
3369566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
3379566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3388c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3398c3ff71bSJunchao Zhang }
3408c3ff71bSJunchao Zhang 
3418c3ff71bSJunchao Zhang /* z = A x + y */
3428c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
3438c3ff71bSJunchao Zhang {
3448c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
345152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
346152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
3478c3ff71bSJunchao Zhang 
3488c3ff71bSJunchao Zhang   PetscFunctionBegin;
3499566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3509566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3519566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx,&xv));
3529566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(yy,&yv));
3539566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(zz,&zv));
3548c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
3558c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
356152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
3579566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx,&xv));
3589566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(yy,&yv));
3599566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
3609566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
3619566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3628c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3638c3ff71bSJunchao Zhang }
3648c3ff71bSJunchao Zhang 
3658c3ff71bSJunchao Zhang /* z = A^T x + y */
3668c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3678c3ff71bSJunchao Zhang {
3688c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
369152b3e56SJunchao Zhang   const char                       *mode;
370152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
371152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
372076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3738c3ff71bSJunchao Zhang 
3748c3ff71bSJunchao Zhang   PetscFunctionBegin;
3759566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3769566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3779566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx,&xv));
3789566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(yy,&yv));
3799566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(zz,&zv));
3808c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
381152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
3829566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
383152b3e56SJunchao Zhang     mode = "N";
384152b3e56SJunchao Zhang   } else {
385076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
386076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
387152b3e56SJunchao Zhang     mode = "T";
388152b3e56SJunchao Zhang   }
389076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
3909566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx,&xv));
3919566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(yy,&yv));
3929566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
3939566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
3949566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3958c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3968c3ff71bSJunchao Zhang }
3978c3ff71bSJunchao Zhang 
3988c3ff71bSJunchao Zhang /* z = A^H x + y */
3998c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
4008c3ff71bSJunchao Zhang {
4018c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
402152b3e56SJunchao Zhang   const char                       *mode;
403152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
404152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
405076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
4068c3ff71bSJunchao Zhang 
4078c3ff71bSJunchao Zhang   PetscFunctionBegin;
4089566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4099566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
4109566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx,&xv));
4119566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(yy,&yv));
4129566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(zz,&zv));
4138c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
414152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4159566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
416152b3e56SJunchao Zhang     mode = "N";
417152b3e56SJunchao Zhang   } else {
418076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
419076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
420152b3e56SJunchao Zhang     mode = "C";
421152b3e56SJunchao Zhang   }
422076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
4239566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx,&xv));
4249566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(yy,&yv));
4259566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
4269566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
4279566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
428152b3e56SJunchao Zhang   PetscFunctionReturn(0);
429152b3e56SJunchao Zhang }
430152b3e56SJunchao Zhang 
431152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
432152b3e56SJunchao Zhang {
433152b3e56SJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
434152b3e56SJunchao Zhang 
435152b3e56SJunchao Zhang   PetscFunctionBegin;
436152b3e56SJunchao Zhang   switch (op) {
437152b3e56SJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
438152b3e56SJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
4399566063dSJacob Faibussowitsch       if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
440152b3e56SJunchao Zhang       A->form_explicit_transpose = flg;
441152b3e56SJunchao Zhang       break;
442152b3e56SJunchao Zhang     default:
4439566063dSJacob Faibussowitsch       PetscCall(MatSetOption_SeqAIJ(A,op,flg));
444152b3e56SJunchao Zhang       break;
445152b3e56SJunchao Zhang   }
4468c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4478c3ff71bSJunchao Zhang }
4488c3ff71bSJunchao Zhang 
449076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
4503d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
4518c3ff71bSJunchao Zhang {
452076ba34aSJunchao Zhang   Mat_SeqAIJ       *aseq;
4538c3ff71bSJunchao Zhang 
4548c3ff71bSJunchao Zhang   PetscFunctionBegin;
4559566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
456076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
4579566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */
4588c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
4599566063dSJacob Faibussowitsch     PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
460076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
4615f80ce2aSJacob Faibussowitsch     PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
4629566063dSJacob Faibussowitsch     PetscCall(PetscFree(A->defaultvectype));
4639566063dSJacob Faibussowitsch     PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */
4649566063dSJacob Faibussowitsch     PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS));
4659566063dSJacob Faibussowitsch     PetscCall(MatSetOps_SeqAIJKokkos(A));
466076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ*>(A->data);
467394ed5ebSJunchao Zhang     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
4685f80ce2aSJacob Faibussowitsch       PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
469076ba34aSJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
4708c3ff71bSJunchao Zhang     }
471076ba34aSJunchao Zhang   }
4728c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4738c3ff71bSJunchao Zhang }
4748c3ff71bSJunchao Zhang 
475076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
476076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
477076ba34aSJunchao Zhang  */
478076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
4798c3ff71bSJunchao Zhang {
480076ba34aSJunchao Zhang   Mat_SeqAIJ            *bseq;
481076ba34aSJunchao Zhang   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
482076ba34aSJunchao Zhang   Mat                   mat;
4838c3ff71bSJunchao Zhang 
4848c3ff71bSJunchao Zhang   PetscFunctionBegin;
485076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
4869566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B));
487076ba34aSJunchao Zhang   mat  = *B;
488076ba34aSJunchao Zhang   if (A->assembled) {
489076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
490076ba34aSJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
491076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
492076ba34aSJunchao Zhang     /* Now copy values to B if needed */
493076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
494076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
495076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
496076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
497076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
498076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
499076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
500076ba34aSJunchao Zhang       }
501076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
502076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
503076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
504076ba34aSJunchao Zhang     }
505076ba34aSJunchao Zhang     mat->spptr = bkok;
506076ba34aSJunchao Zhang   }
507076ba34aSJunchao Zhang 
5089566063dSJacob Faibussowitsch   PetscCall(PetscFree(mat->defaultvectype));
5099566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */
5109566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS));
5119566063dSJacob Faibussowitsch   PetscCall(MatSetOps_SeqAIJKokkos(mat));
5128c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5138c3ff71bSJunchao Zhang }
5148c3ff71bSJunchao Zhang 
5150ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
5160ecb592aSJunchao Zhang {
5170ecb592aSJunchao Zhang   Mat               At;
518ff751488SJunchao Zhang   KokkosCsrMatrix   *internT;
5190ecb592aSJunchao Zhang   Mat_SeqAIJKokkos  *atkok,*bkok;
5200ecb592aSJunchao Zhang 
5210ecb592aSJunchao Zhang   PetscFunctionBegin;
522*7fb60732SBarry Smith   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A,*B));
5239566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */
5240ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
525ff751488SJunchao Zhang     /* Deep copy internT, as we want to isolate the internal transpose */
5269566063dSJacob Faibussowitsch     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
5279566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At));
5280ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
5299566063dSJacob Faibussowitsch     else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */
5300ecb592aSJunchao Zhang   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
5310ecb592aSJunchao Zhang     if ((*B)->assembled) {
5320ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
5339566063dSJacob Faibussowitsch       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
5349566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
5350ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
5360ecb592aSJunchao Zhang       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
5370ecb592aSJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
5380ecb592aSJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
5399566063dSJacob Faibussowitsch       PetscCallCXX(Kokkos::deep_copy(a_h,internT->values));
5409566063dSJacob Faibussowitsch       PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
5410ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
5420ecb592aSJunchao Zhang   }
5430ecb592aSJunchao Zhang   PetscFunctionReturn(0);
5440ecb592aSJunchao Zhang }
5450ecb592aSJunchao Zhang 
5468c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
5478c3ff71bSJunchao Zhang {
54886a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
5498c3ff71bSJunchao Zhang 
5508c3ff71bSJunchao Zhang   PetscFunctionBegin;
55186a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
55286a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
5538c3ff71bSJunchao Zhang     delete aijkok;
55486a27549SJunchao Zhang   } else {
55586a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
55686a27549SJunchao Zhang   }
557cbc6b225SStefano Zampini   A->spptr = NULL;
5589566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
5599566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
5609566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
5619566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
5628c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5638c3ff71bSJunchao Zhang }
5648c3ff71bSJunchao Zhang 
5653f3ba80aSJunchao Zhang /*MC
5663f3ba80aSJunchao Zhang    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
5673f3ba80aSJunchao Zhang 
5683f3ba80aSJunchao Zhang    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
5693f3ba80aSJunchao Zhang 
5703f3ba80aSJunchao Zhang    Options Database Keys:
5713f3ba80aSJunchao Zhang .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
5723f3ba80aSJunchao Zhang 
5733f3ba80aSJunchao Zhang   Level: beginner
5743f3ba80aSJunchao Zhang 
575db781477SPatrick Sanan .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
5763f3ba80aSJunchao Zhang M*/
57786a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
57886a27549SJunchao Zhang {
57986a27549SJunchao Zhang   PetscFunctionBegin;
5809566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
5819566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(A));
5829566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A));
58386a27549SJunchao Zhang   PetscFunctionReturn(0);
58486a27549SJunchao Zhang }
58586a27549SJunchao Zhang 
586076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
587076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
588a3f881fbSStefano Zampini {
589076ba34aSJunchao Zhang   Mat_SeqAIJ                   *a,*b;
590076ba34aSJunchao Zhang   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
591076ba34aSJunchao Zhang   MatScalarKokkosView          aa,ba,ca;
592076ba34aSJunchao Zhang   MatRowMapKokkosView          ai,bi,ci;
593076ba34aSJunchao Zhang   MatColIdxKokkosView          aj,bj,cj;
594076ba34aSJunchao Zhang   PetscInt                     m,n,nnz,aN;
595a3f881fbSStefano Zampini 
596a3f881fbSStefano Zampini   PetscFunctionBegin;
597076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
598076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
599076ba34aSJunchao Zhang   PetscValidPointer(C,4);
600076ba34aSJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
601076ba34aSJunchao Zhang   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
6025f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
6035f80ce2aSJacob Faibussowitsch   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
604076ba34aSJunchao Zhang 
6059566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
6069566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
607076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ*>(A->data);
608076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ*>(B->data);
609076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
610076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
611076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
612076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
613076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
614076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
615076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
616076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
617076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
618076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
619076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
620076ba34aSJunchao Zhang     aj = akok->j_dual.view_device();
621076ba34aSJunchao Zhang     bj = bkok->j_dual.view_device();
622076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
623076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
624076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
625076ba34aSJunchao Zhang     ca = ca_dual.view_device();
626076ba34aSJunchao Zhang     ci = ci_dual.view_device();
627076ba34aSJunchao Zhang     cj = cj_dual.view_device();
628076ba34aSJunchao Zhang 
629076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
630076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
631076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
632076ba34aSJunchao Zhang       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
633076ba34aSJunchao Zhang 
634076ba34aSJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
635076ba34aSJunchao Zhang         ci(i) = coffset;
636076ba34aSJunchao Zhang         if (i == m-1) ci(m) = ai(m) + bi(m);
637076ba34aSJunchao Zhang       });
638076ba34aSJunchao Zhang 
639076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
640076ba34aSJunchao Zhang         if (k < alen) {
641076ba34aSJunchao Zhang           ca(coffset+k) = aa(ai(i)+k);
642076ba34aSJunchao Zhang           cj(coffset+k) = aj(ai(i)+k);
643076ba34aSJunchao Zhang         } else {
644076ba34aSJunchao Zhang           ca(coffset+k) = ba(bi(i)+k-alen);
645076ba34aSJunchao Zhang           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
646076ba34aSJunchao Zhang         }
647076ba34aSJunchao Zhang       });
648076ba34aSJunchao Zhang     });
649076ba34aSJunchao Zhang     ca_dual.modify_device();
650076ba34aSJunchao Zhang     ci_dual.modify_device();
651076ba34aSJunchao Zhang     cj_dual.modify_device();
6529566063dSJacob Faibussowitsch     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
6539566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C));
654076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
655076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
656076ba34aSJunchao Zhang     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
657076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
658076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
659076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
660076ba34aSJunchao Zhang 
661076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
662076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
663076ba34aSJunchao Zhang       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
664076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
665076ba34aSJunchao Zhang         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
666076ba34aSJunchao Zhang         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
667076ba34aSJunchao Zhang       });
668076ba34aSJunchao Zhang     });
6699566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
670076ba34aSJunchao Zhang   }
671076ba34aSJunchao Zhang   PetscFunctionReturn(0);
672076ba34aSJunchao Zhang }
673076ba34aSJunchao Zhang 
674076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
675076ba34aSJunchao Zhang {
676076ba34aSJunchao Zhang   PetscFunctionBegin;
677076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
678a3f881fbSStefano Zampini   PetscFunctionReturn(0);
679a3f881fbSStefano Zampini }
680a3f881fbSStefano Zampini 
681a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
682a3f881fbSStefano Zampini {
683a3f881fbSStefano Zampini   Mat_Product                    *product = C->product;
684a3f881fbSStefano Zampini   Mat                            A,B;
685076ba34aSJunchao Zhang   bool                           transA,transB; /* use bool, since KK needs this type */
686a3f881fbSStefano Zampini   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
687a3f881fbSStefano Zampini   Mat_SeqAIJ                     *c;
688076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
689076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB;
690a3f881fbSStefano Zampini 
691a3f881fbSStefano Zampini   PetscFunctionBegin;
692a3f881fbSStefano Zampini   MatCheckProduct(C,1);
6935f80ce2aSJacob Faibussowitsch   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
694076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
695076ba34aSJunchao Zhang 
696076ba34aSJunchao Zhang   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
697076ba34aSJunchao Zhang     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
698076ba34aSJunchao Zhang     PetscFunctionReturn(0);
699076ba34aSJunchao Zhang   }
700076ba34aSJunchao Zhang 
701076ba34aSJunchao Zhang   switch (product->type) {
702076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
703076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
704076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
705076ba34aSJunchao Zhang     default:
70698921bdaSJacob Faibussowitsch       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
707076ba34aSJunchao Zhang   }
708076ba34aSJunchao Zhang 
709a3f881fbSStefano Zampini   A     = product->A;
710a3f881fbSStefano Zampini   B     = product->B;
7119566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
7129566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
713a3f881fbSStefano Zampini   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
714a3f881fbSStefano Zampini   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
715a3f881fbSStefano Zampini   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
716076ba34aSJunchao Zhang 
7175f80ce2aSJacob Faibussowitsch   PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
718076ba34aSJunchao Zhang 
719076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
720076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
721076ba34aSJunchao Zhang 
722076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
723076ba34aSJunchao Zhang   if (transA) {
7249566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
725076ba34aSJunchao Zhang     transA = false;
726a3f881fbSStefano Zampini   }
727a3f881fbSStefano Zampini 
728076ba34aSJunchao Zhang   if (transB) {
7299566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
730076ba34aSJunchao Zhang     transB = false;
731076ba34aSJunchao Zhang   }
7329566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
7339566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
7349566063dSJacob Faibussowitsch   PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
7359566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
7369566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(C));
737a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
738a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
7399566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
7409566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
7419566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
742a3f881fbSStefano Zampini   c->reallocs         = 0;
743076ba34aSJunchao Zhang   C->info.mallocs     = 0;
744a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
745a3f881fbSStefano Zampini   C->assembled        = C->was_assembled = PETSC_TRUE;
746a3f881fbSStefano Zampini   C->num_ass++;
747a3f881fbSStefano Zampini   PetscFunctionReturn(0);
748a3f881fbSStefano Zampini }
749a3f881fbSStefano Zampini 
750a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
751a3f881fbSStefano Zampini {
752076ba34aSJunchao Zhang   Mat_Product                    *product = C->product;
753076ba34aSJunchao Zhang   MatProductType                 ptype;
754076ba34aSJunchao Zhang   Mat                            A,B;
755076ba34aSJunchao Zhang   bool                           transA,transB;
756076ba34aSJunchao Zhang   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
757076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
758076ba34aSJunchao Zhang   MPI_Comm                       comm;
759076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
760a3f881fbSStefano Zampini 
761a3f881fbSStefano Zampini   PetscFunctionBegin;
762a3f881fbSStefano Zampini   MatCheckProduct(C,1);
7639566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
7645f80ce2aSJacob Faibussowitsch   PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
765a3f881fbSStefano Zampini   A       = product->A;
766a3f881fbSStefano Zampini   B       = product->B;
7679566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
7689566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
769a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
770a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
771076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
772076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
773076ba34aSJunchao Zhang 
774a3f881fbSStefano Zampini   ptype   = product->type;
775a3f881fbSStefano Zampini   switch (ptype) {
776076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
777076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
778076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
779a3f881fbSStefano Zampini     default:
78098921bdaSJacob Faibussowitsch       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
781a3f881fbSStefano Zampini   }
782a3f881fbSStefano Zampini 
783076ba34aSJunchao Zhang   product->data = pdata = new MatProductData_SeqAIJKokkos();
784076ba34aSJunchao Zhang   pdata->kh.set_team_work_size(16);
785076ba34aSJunchao Zhang   pdata->kh.set_dynamic_scheduling(true);
786076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
787a3f881fbSStefano Zampini 
788076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
789076ba34aSJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
7906ffb9508SJunchao Zhang   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
7916ffb9508SJunchao Zhang      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
7926ffb9508SJunchao Zhang    */
793076ba34aSJunchao Zhang   pdata->kh.create_spgemm_handle(spgemm_alg);
794076ba34aSJunchao Zhang 
7959566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
796076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
797076ba34aSJunchao Zhang   if (transA) {
7989566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
799076ba34aSJunchao Zhang     transA = false;
800076ba34aSJunchao Zhang   }
801076ba34aSJunchao Zhang 
802076ba34aSJunchao Zhang   if (transB) {
8039566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
804076ba34aSJunchao Zhang     transB = false;
805076ba34aSJunchao Zhang   }
806076ba34aSJunchao Zhang 
8079566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
808076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
809076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
810076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
811076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
812076ba34aSJunchao Zhang   */
8139566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
8149566063dSJacob Faibussowitsch   PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC));
8159566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
816076ba34aSJunchao Zhang 
8179566063dSJacob Faibussowitsch   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
8189566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok));
819076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
820a3f881fbSStefano Zampini   PetscFunctionReturn(0);
821a3f881fbSStefano Zampini }
822a3f881fbSStefano Zampini 
823a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
824076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
825a3f881fbSStefano Zampini {
826076ba34aSJunchao Zhang   Mat_Product    *product = mat->product;
827a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
828a3f881fbSStefano Zampini 
829a3f881fbSStefano Zampini   PetscFunctionBegin;
830a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
8319566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok));
832a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
8339566063dSJacob Faibussowitsch     PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok));
834a3f881fbSStefano Zampini   }
835a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
836a3f881fbSStefano Zampini     switch (product->type) {
837a3f881fbSStefano Zampini     case MATPRODUCT_AB:
838a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
839a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
840a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
841a3f881fbSStefano Zampini       break;
842a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
843a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
844a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
845a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
846a3f881fbSStefano Zampini       break;
847a3f881fbSStefano Zampini     default:
848a3f881fbSStefano Zampini       break;
849a3f881fbSStefano Zampini     }
850a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
8519566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
852a3f881fbSStefano Zampini   }
853a3f881fbSStefano Zampini   PetscFunctionReturn(0);
854a3f881fbSStefano Zampini }
855a587d139SMark 
856f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
857f0cf5187SStefano Zampini {
858f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
859f0cf5187SStefano Zampini 
860f0cf5187SStefano Zampini   PetscFunctionBegin;
8619566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
8629566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
863f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
864076ba34aSJunchao Zhang   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
8659566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
8669566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
8679566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
868f0cf5187SStefano Zampini   PetscFunctionReturn(0);
869f0cf5187SStefano Zampini }
870f0cf5187SStefano Zampini 
871a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
872a587d139SMark {
873076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
874a587d139SMark 
875a587d139SMark   PetscFunctionBegin;
876076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
8772328674fSJunchao Zhang   if (aijkok) { /* Only zero the device if data is already there */
878076ba34aSJunchao Zhang     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
8799566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(A));
8802328674fSJunchao Zhang   } else { /* Might be preallocated but not assembled */
8819566063dSJacob Faibussowitsch     PetscCall(MatZeroEntries_SeqAIJ(A));
8822328674fSJunchao Zhang   }
883a587d139SMark   PetscFunctionReturn(0);
884a587d139SMark }
885a587d139SMark 
886f78ce678SMark Adams static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A,Vec x)
887f78ce678SMark Adams {
888f78ce678SMark Adams   Mat_SeqAIJ                   *aijseq;
889f78ce678SMark Adams   Mat_SeqAIJKokkos             *aijkok;
890f78ce678SMark Adams   PetscInt                     n;
891f78ce678SMark Adams   PetscScalarKokkosView        xv;
892f78ce678SMark Adams 
893f78ce678SMark Adams   PetscFunctionBegin;
894f78ce678SMark Adams   PetscCall(VecGetLocalSize(x,&n));
895f78ce678SMark Adams   PetscCheck(n == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
896f78ce678SMark Adams   PetscCheck(A->factortype == MAT_FACTOR_NONE,PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
897f78ce678SMark Adams 
898f78ce678SMark Adams   PetscCall(MatSeqAIJKokkosSyncDevice(A));
899f78ce678SMark Adams   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
900f78ce678SMark Adams 
901f78ce678SMark Adams   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
902f78ce678SMark Adams     PetscCall(MatMarkDiagonal_SeqAIJ(A));
903f78ce678SMark Adams     aijseq = static_cast<Mat_SeqAIJ*>(A->data);
904f78ce678SMark Adams     aijkok->SetDiagonal(aijseq->diag);
905f78ce678SMark Adams   }
906f78ce678SMark Adams 
907f78ce678SMark Adams   const auto& Aa = aijkok->a_dual.view_device();
908f78ce678SMark Adams   const auto& Ai = aijkok->i_dual.view_device();
909f78ce678SMark Adams   const auto& Adiag = aijkok->diag_dual.view_device();
910f78ce678SMark Adams 
911f78ce678SMark Adams   PetscCall(VecGetKokkosViewWrite(x,&xv));
912f78ce678SMark Adams   Kokkos::parallel_for(n,KOKKOS_LAMBDA(const PetscInt i) {
913f78ce678SMark Adams     if (Adiag(i) < Ai(i+1)) xv(i) = Aa(Adiag(i));
914f78ce678SMark Adams     else xv(i) = 0;
915f78ce678SMark Adams   });
916f78ce678SMark Adams   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
917f78ce678SMark Adams   PetscFunctionReturn(0);
918f78ce678SMark Adams }
919f78ce678SMark Adams 
920db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
92142550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
922db78de30SJunchao Zhang {
923db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
924db78de30SJunchao Zhang 
925db78de30SJunchao Zhang   PetscFunctionBegin;
926db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
927db78de30SJunchao Zhang   PetscValidPointer(kv,2);
928db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9299566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
930db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
931076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
932db78de30SJunchao Zhang   PetscFunctionReturn(0);
933db78de30SJunchao Zhang }
934db78de30SJunchao Zhang 
93542550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
936db78de30SJunchao Zhang {
937db78de30SJunchao Zhang   PetscFunctionBegin;
938db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
939db78de30SJunchao Zhang   PetscValidPointer(kv,2);
940db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
941db78de30SJunchao Zhang   PetscFunctionReturn(0);
942db78de30SJunchao Zhang }
943db78de30SJunchao Zhang 
94442550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
945db78de30SJunchao Zhang {
946db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
947db78de30SJunchao Zhang 
948db78de30SJunchao Zhang   PetscFunctionBegin;
949db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
950db78de30SJunchao Zhang   PetscValidPointer(kv,2);
951db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9529566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
953db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
954076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
955db78de30SJunchao Zhang   PetscFunctionReturn(0);
956db78de30SJunchao Zhang }
957db78de30SJunchao Zhang 
95842550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
959db78de30SJunchao Zhang {
960db78de30SJunchao Zhang   PetscFunctionBegin;
961db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
962db78de30SJunchao Zhang   PetscValidPointer(kv,2);
963db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9649566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
965db78de30SJunchao Zhang   PetscFunctionReturn(0);
966db78de30SJunchao Zhang }
967db78de30SJunchao Zhang 
96842550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
969db78de30SJunchao Zhang {
970db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
971db78de30SJunchao Zhang 
972db78de30SJunchao Zhang   PetscFunctionBegin;
973db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
974db78de30SJunchao Zhang   PetscValidPointer(kv,2);
975db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
976db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
977076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
978db78de30SJunchao Zhang   PetscFunctionReturn(0);
979db78de30SJunchao Zhang }
980db78de30SJunchao Zhang 
98142550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
982db78de30SJunchao Zhang {
983db78de30SJunchao Zhang   PetscFunctionBegin;
984db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
985db78de30SJunchao Zhang   PetscValidPointer(kv,2);
986db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9879566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
988db78de30SJunchao Zhang   PetscFunctionReturn(0);
989db78de30SJunchao Zhang }
990db78de30SJunchao Zhang 
991c17cf699SJunchao Zhang /* Computes Y += alpha X */
992c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
993a587d139SMark {
994a587d139SMark   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
995c17cf699SJunchao Zhang   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
996c17cf699SJunchao Zhang   ConstMatScalarKokkosView   Xa;
997c17cf699SJunchao Zhang   MatScalarKokkosView        Ya;
998a587d139SMark 
999a587d139SMark   PetscFunctionBegin;
1000c17cf699SJunchao Zhang   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
1001c17cf699SJunchao Zhang   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
10029566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
10039566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(X));
10049566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1005db78de30SJunchao Zhang 
1006c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1007c17cf699SJunchao Zhang     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
1008a587d139SMark     PetscBool e;
10099566063dSJacob Faibussowitsch     PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e));
1010a587d139SMark     if (e) {
10119566063dSJacob Faibussowitsch       PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e));
1012c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
1013a587d139SMark     }
1014a587d139SMark   }
1015db78de30SJunchao Zhang 
1016c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1017c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1018c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1019c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1020c17cf699SJunchao Zhang   */
1021c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
1022c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
1023c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
1024c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
1025c17cf699SJunchao Zhang 
1026c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
1027c17cf699SJunchao Zhang     KokkosBlas::axpy(alpha,Xa,Ya);
10289566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1029c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1030c17cf699SJunchao Zhang     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
1031c17cf699SJunchao Zhang     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
1032c17cf699SJunchao Zhang 
1033c17cf699SJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
1034c17cf699SJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
1035c17cf699SJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
1036c17cf699SJunchao Zhang         PetscInt p,q = Yi(i);
1037c17cf699SJunchao Zhang         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1038c17cf699SJunchao Zhang           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1039c17cf699SJunchao Zhang           if (Xj(p) == Yj(q)) { /* Found it */
1040c17cf699SJunchao Zhang             Ya(q) += alpha * Xa(p);
1041c17cf699SJunchao Zhang             q++;
1042a587d139SMark           } else {
1043c17cf699SJunchao Zhang             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1044c17cf699SJunchao Zhang                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1045c17cf699SJunchao Zhang             */
10468b8b16f9SJunchao Zhang             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) =
10478b8b16f9SJunchao Zhang            #if PETSC_PKG_KOKKOS_VERSION_GE(3,6,99)
10488b8b16f9SJunchao Zhang               Kokkos::nan("1"); /* auto promote the double NaN if needed */
10498b8b16f9SJunchao Zhang            #else
10508b8b16f9SJunchao Zhang               Kokkos::Experimental::nan("1");
10518b8b16f9SJunchao Zhang            #endif
1052a587d139SMark           }
1053c17cf699SJunchao Zhang         }
1054c17cf699SJunchao Zhang       });
1055c17cf699SJunchao Zhang     });
10569566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1057c17cf699SJunchao Zhang   } else { /* different nonzero patterns */
1058c17cf699SJunchao Zhang     Mat             Z;
1059c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
1060c17cf699SJunchao Zhang     KernelHandle    kh;
1061c17cf699SJunchao Zhang     kh.create_spadd_handle(false);
1062c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1063c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1064c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
10659566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z));
10669566063dSJacob Faibussowitsch     PetscCall(MatHeaderReplace(Y,&Z));
1067c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1068c17cf699SJunchao Zhang   }
10699566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
10709566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */
1071a587d139SMark   PetscFunctionReturn(0);
1072a587d139SMark }
1073a587d139SMark 
1074e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
107542550becSJunchao Zhang {
107642550becSJunchao Zhang   Mat_SeqAIJKokkos *akok;
107742550becSJunchao Zhang   Mat_SeqAIJ       *aseq;
107842550becSJunchao Zhang 
107942550becSJunchao Zhang   PetscFunctionBegin;
10809566063dSJacob Faibussowitsch   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
1081394ed5ebSJunchao Zhang   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
108242550becSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1083cbc6b225SStefano Zampini   delete akok;
1084cbc6b225SStefano Zampini   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE);
10859566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1086394ed5ebSJunchao Zhang   akok->SetUpCOO(aseq);
108742550becSJunchao Zhang   PetscFunctionReturn(0);
108842550becSJunchao Zhang }
108942550becSJunchao Zhang 
109042550becSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
109142550becSJunchao Zhang {
109242550becSJunchao Zhang   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
109342550becSJunchao Zhang   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1094394ed5ebSJunchao Zhang   PetscCount                  Annz = aseq->nz;
1095394ed5ebSJunchao Zhang   const PetscCountKokkosView& jmap = akok->jmap_d;
1096394ed5ebSJunchao Zhang   const PetscCountKokkosView& perm = akok->perm_d;
109742550becSJunchao Zhang   MatScalarKokkosView         Aa;
109842550becSJunchao Zhang   ConstMatScalarKokkosView    kv;
109942550becSJunchao Zhang   PetscMemType                memtype;
110042550becSJunchao Zhang 
110142550becSJunchao Zhang   PetscFunctionBegin;
11029566063dSJacob Faibussowitsch   PetscCall(PetscGetMemType(v,&memtype));
110342550becSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1104394ed5ebSJunchao Zhang     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
110542550becSJunchao Zhang   } else {
1106394ed5ebSJunchao Zhang     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
110742550becSJunchao Zhang   }
110842550becSJunchao Zhang 
1109c7b718f4SJunchao Zhang   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */
1110c7b718f4SJunchao Zhang   else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */
111142550becSJunchao Zhang 
1112c7b718f4SJunchao Zhang   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {
1113c7b718f4SJunchao Zhang     PetscScalar sum = 0.0;
1114c7b718f4SJunchao Zhang     for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k));
1115c7b718f4SJunchao Zhang     Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum;
1116c7b718f4SJunchao Zhang   });
1117394ed5ebSJunchao Zhang 
11189566063dSJacob Faibussowitsch   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
11199566063dSJacob Faibussowitsch   else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa));
112042550becSJunchao Zhang   PetscFunctionReturn(0);
112142550becSJunchao Zhang }
112242550becSJunchao Zhang 
11235fbaff96SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag)
11245fbaff96SJunchao Zhang {
11255fbaff96SJunchao Zhang   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
11265fbaff96SJunchao Zhang   MatScalarKokkosView         Aa;
11275fbaff96SJunchao Zhang   const MatRowMapKokkosView&  Ai = akok->i_dual.view_device();
11285fbaff96SJunchao Zhang   PetscInt                    m = A->rmap->n;
11295fbaff96SJunchao Zhang   ConstMatRowMapKokkosView    Adiag(diag,m); /* diag is a device pointer */
11305fbaff96SJunchao Zhang 
11315fbaff96SJunchao Zhang   PetscFunctionBegin;
11325fbaff96SJunchao Zhang   PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa));
11335fbaff96SJunchao Zhang   Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) {
11345fbaff96SJunchao Zhang     PetscScalar tmp;
11355fbaff96SJunchao Zhang     if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */
11365fbaff96SJunchao Zhang       tmp          = Aa(Ai(i));
11375fbaff96SJunchao Zhang       Aa(Ai(i))    = Aa(Adiag(i));
11385fbaff96SJunchao Zhang       Aa(Adiag(i)) = tmp;
11395fbaff96SJunchao Zhang     }
11405fbaff96SJunchao Zhang   });
11415fbaff96SJunchao Zhang   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
11425fbaff96SJunchao Zhang   PetscFunctionReturn(0);
11435fbaff96SJunchao Zhang }
11445fbaff96SJunchao Zhang 
114586a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
11468f7e8f9dSMark Adams {
11478f7e8f9dSMark Adams   PetscFunctionBegin;
11489566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncHost(A));
11499566063dSJacob Faibussowitsch   PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
11508f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
11518f7e8f9dSMark Adams   PetscFunctionReturn(0);
11528f7e8f9dSMark Adams }
11538f7e8f9dSMark Adams 
11548c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
11558c3ff71bSJunchao Zhang {
1156076ba34aSJunchao Zhang   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1157076ba34aSJunchao Zhang 
11588c3ff71bSJunchao Zhang   PetscFunctionBegin;
1159076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
11606f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
11616f3d89d0SStefano Zampini 
11628c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
11638c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
11648c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1165a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1166f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1167a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1168076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
11698c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
11708c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
11718c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
11728c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
11738c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
11748c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1175076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
11760ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1177152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1178f78ce678SMark Adams   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1179076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1180076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1181076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1182076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1183076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1184076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
11857ee59b9bSJunchao Zhang   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
118642550becSJunchao Zhang 
11879566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos));
11889566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos));
1189076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1190076ba34aSJunchao Zhang }
1191076ba34aSJunchao Zhang 
1192076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1193076ba34aSJunchao Zhang {
1194076ba34aSJunchao Zhang   Mat_SeqAIJ *aseq;
1195076ba34aSJunchao Zhang   PetscInt    i,m,n;
1196076ba34aSJunchao Zhang 
1197076ba34aSJunchao Zhang   PetscFunctionBegin;
11985f80ce2aSJacob Faibussowitsch   PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1199076ba34aSJunchao Zhang 
1200076ba34aSJunchao Zhang   m = akok->nrows();
1201076ba34aSJunchao Zhang   n = akok->ncols();
12029566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(A,m,n,m,n));
12039566063dSJacob Faibussowitsch   PetscCall(MatSetType(A,MATSEQAIJKOKKOS));
1204076ba34aSJunchao Zhang 
1205076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
12069566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL));
1207076ba34aSJunchao Zhang   aseq = (Mat_SeqAIJ*)(A)->data;
1208076ba34aSJunchao Zhang 
1209076ba34aSJunchao Zhang   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1210076ba34aSJunchao Zhang   akok->j_dual.sync_host();
1211076ba34aSJunchao Zhang 
1212076ba34aSJunchao Zhang   aseq->i            = akok->i_host_data();
1213076ba34aSJunchao Zhang   aseq->j            = akok->j_host_data();
1214076ba34aSJunchao Zhang   aseq->a            = akok->a_host_data();
1215076ba34aSJunchao Zhang   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1216076ba34aSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE;
1217076ba34aSJunchao Zhang   aseq->free_a       = PETSC_FALSE;
1218076ba34aSJunchao Zhang   aseq->free_ij      = PETSC_FALSE;
1219076ba34aSJunchao Zhang   aseq->nz           = akok->nnz();
1220076ba34aSJunchao Zhang   aseq->maxnz        = aseq->nz;
1221076ba34aSJunchao Zhang 
12229566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m,&aseq->imax));
12239566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m,&aseq->ilen));
1224076ba34aSJunchao Zhang   for (i=0; i<m; i++) {
1225076ba34aSJunchao Zhang     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1226076ba34aSJunchao Zhang   }
1227076ba34aSJunchao Zhang 
1228076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1229076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1230ff751488SJunchao Zhang   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
12319566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
12329566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
1233076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1234076ba34aSJunchao Zhang }
1235076ba34aSJunchao Zhang 
1236076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1237076ba34aSJunchao Zhang 
1238076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1239076ba34aSJunchao Zhang  */
1240076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1241076ba34aSJunchao Zhang {
1242076ba34aSJunchao Zhang   PetscFunctionBegin;
12439566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm,A));
12449566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok));
12458c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
12468c3ff71bSJunchao Zhang }
12478c3ff71bSJunchao Zhang 
12488c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
1249152b3e56SJunchao Zhang /*@C
12508c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
12518c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
12528c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
12538c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
12548c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
12558c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
12568c3ff71bSJunchao Zhang 
12578c3ff71bSJunchao Zhang    Collective
12588c3ff71bSJunchao Zhang 
12598c3ff71bSJunchao Zhang    Input Parameters:
12608c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
12618c3ff71bSJunchao Zhang .  m - number of rows
12628c3ff71bSJunchao Zhang .  n - number of columns
12638c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
12648c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
12658c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
12668c3ff71bSJunchao Zhang 
12678c3ff71bSJunchao Zhang    Output Parameter:
12688c3ff71bSJunchao Zhang .  A - the matrix
12698c3ff71bSJunchao Zhang 
12708c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
12718c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
12728c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
12738c3ff71bSJunchao Zhang 
12748c3ff71bSJunchao Zhang    Notes:
12758c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
12768c3ff71bSJunchao Zhang 
12778c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
12788c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
12798c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
12808c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
12818c3ff71bSJunchao Zhang 
12828c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
12838c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
12848c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
12858c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
12868c3ff71bSJunchao Zhang 
12878c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
12888c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
12898c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
12908c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
12918c3ff71bSJunchao Zhang 
12928c3ff71bSJunchao Zhang    Level: intermediate
12938c3ff71bSJunchao Zhang 
1294db781477SPatrick Sanan .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
12958c3ff71bSJunchao Zhang @*/
12968c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
12978c3ff71bSJunchao Zhang {
12988c3ff71bSJunchao Zhang   PetscFunctionBegin;
12999566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
13009566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm,A));
13019566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A,m,n,m,n));
13029566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A,MATSEQAIJKOKKOS));
13039566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
13048c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
13058c3ff71bSJunchao Zhang }
1306930e68a5SMark Adams 
13078f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
13088f7e8f9dSMark Adams //
130946804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used).
13108f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
13118f7e8f9dSMark Adams //
13128f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1313930e68a5SMark Adams {
13148f7e8f9dSMark Adams   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
13158f7e8f9dSMark Adams   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
13168f7e8f9dSMark Adams   IS                 isrow = b->row,isicol = b->icol;
13178f7e8f9dSMark Adams   const PetscInt     *r_h,*ic_h;
1318300d22a6SJunchao Zhang   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1319076ba34aSJunchao Zhang   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1320076ba34aSJunchao Zhang   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
13218f7e8f9dSMark Adams   PetscBool          row_identity,col_identity;
132246804e07SMark Adams   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1323930e68a5SMark Adams 
1324930e68a5SMark Adams   PetscFunctionBegin;
13252c71b3e2SJacob Faibussowitsch   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1326b94d7dedSBarry Smith   PetscCall(MatIsStructurallySymmetric(A,&row_identity));
13272c71b3e2SJacob Faibussowitsch   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
13289566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&r_h));
13299566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isicol,&ic_h));
13309566063dSJacob Faibussowitsch   PetscCall(ISGetSize(isicol,&nc));
13319566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
13329566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
13338f7e8f9dSMark Adams   {
13348f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
13358f7e8f9dSMark Adams     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
13368f7e8f9dSMark Adams     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
13378f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
13388f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
13398f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
13408f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
13418f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
13428f7e8f9dSMark Adams     size_t flops_h = 0.0;
13438f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
13448f7e8f9dSMark Adams     Kokkos::View<size_t> d_flops_k ("flops");
13458f7e8f9dSMark Adams     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
13468f7e8f9dSMark Adams     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
13478f7e8f9dSMark Adams     Kokkos::deep_copy (d_flops_k, h_flops_k);
13488f7e8f9dSMark Adams     Kokkos::deep_copy (d_r_k, h_r_k);
13498f7e8f9dSMark Adams     Kokkos::deep_copy (d_ic_k, h_ic_k);
13508f7e8f9dSMark Adams     // Fill A --> fact
13518f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1352042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
13538f7e8f9dSMark Adams         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
13548f7e8f9dSMark Adams         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
13558f7e8f9dSMark Adams         // zero rows of B
13568f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
13578f7e8f9dSMark Adams             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
13588f7e8f9dSMark Adams             PetscScalar *baL = ba_d + bi_d[rowb];
13598f7e8f9dSMark Adams             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
13608f7e8f9dSMark Adams             /* zero (unfactored row) */
13618f7e8f9dSMark Adams             for (int j=0;j<nzbL;j++) baL[j] = 0;
13628f7e8f9dSMark Adams             for (int j=0;j<nzbU;j++) baU[j] = 0;
13638f7e8f9dSMark Adams           });
13648f7e8f9dSMark Adams         // copy A into B
13658f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
13668f7e8f9dSMark Adams             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
13678f7e8f9dSMark Adams             const PetscScalar *av    = aa_d + ai_d[rowa];
13688f7e8f9dSMark Adams             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
13698f7e8f9dSMark Adams             /* load in initial (unfactored row) */
13708f7e8f9dSMark Adams             for (int j=0;j<nza;j++) {
13718f7e8f9dSMark Adams               PetscInt    colb = ic[ajtmp[j]];
13728f7e8f9dSMark Adams               PetscScalar vala = av[j];
13738f7e8f9dSMark Adams               if (colb == rowb) {
13748f7e8f9dSMark Adams                 *(ba_d + bdiag_d[rowb]) = vala;
13758f7e8f9dSMark Adams               } else {
13768f7e8f9dSMark Adams                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
13778f7e8f9dSMark Adams                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
13788f7e8f9dSMark Adams                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
13798f7e8f9dSMark Adams                 for (int j=0; j<nz ; j++) {
13808f7e8f9dSMark Adams                   if (pbj[j] == colb) {
13818f7e8f9dSMark Adams                     pba[j] = vala;
13828f7e8f9dSMark Adams                     set++;
13838f7e8f9dSMark Adams                     break;
13848f7e8f9dSMark Adams                   }
13858f7e8f9dSMark Adams                 }
13868f1da0b2SJunchao Zhang                #if !defined(PETSC_HAVE_SYCL)
13878f7e8f9dSMark Adams                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
13888f1da0b2SJunchao Zhang                #endif
13898f7e8f9dSMark Adams               }
13908f7e8f9dSMark Adams             }
13918f7e8f9dSMark Adams           });
13928f7e8f9dSMark Adams       });
13938f7e8f9dSMark Adams     Kokkos::fence();
1394930e68a5SMark Adams 
13958f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
13968f7e8f9dSMark Adams         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
13978f7e8f9dSMark Adams         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
13988f7e8f9dSMark Adams         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1399042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
14008f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc;
14018f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
14028f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
14038f7e8f9dSMark Adams         for (int ii=start; ii<end-1; ii++) {
14048f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
14058f7e8f9dSMark Adams           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
14068f7e8f9dSMark Adams           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
14078f7e8f9dSMark Adams           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
14088f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
14098f7e8f9dSMark Adams               PetscInt kIdx = j*Ni + field_block_idx;
14108f7e8f9dSMark Adams               if (kIdx >= nzUi) /* void */ ;
14118f7e8f9dSMark Adams               else {
14128f7e8f9dSMark Adams                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
14138f7e8f9dSMark Adams                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
14148f7e8f9dSMark Adams                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
14158f7e8f9dSMark Adams                 size_t         st_idx;
14168f7e8f9dSMark Adams                 // find and do L(k,i) = A(:k,i) / A(i,i)
14178f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
14188f7e8f9dSMark Adams                 // get column, there has got to be a better way
14198f7e8f9dSMark Adams                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
14208f7e8f9dSMark Adams                     if (pjL[j] == ii) {
14218f7e8f9dSMark Adams                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
14228f7e8f9dSMark Adams                       idx = j; // output
14238f7e8f9dSMark Adams                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
14248f7e8f9dSMark Adams                     }
14258f7e8f9dSMark Adams                 }, st_idx);
14268f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
14278f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
142899551766SMark Adams                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
142999551766SMark Adams #endif
143099551766SMark Adams                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
14318f7e8f9dSMark Adams                 // U(i+1,:end)
14328f7e8f9dSMark Adams                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
14338f7e8f9dSMark Adams                       PetscScalar Uij = baUi[uiIdx];
14348f7e8f9dSMark Adams                       PetscInt    col = bjUi[uiIdx];
14358f7e8f9dSMark Adams                       if (col==myk) {
14368f7e8f9dSMark Adams                         // A_kk = A_kk - L_ki * U_ij(k)
14378f7e8f9dSMark Adams                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
14388f7e8f9dSMark Adams                         *Akkv = *Akkv - L_ki() * Uij; // UiK
14398f7e8f9dSMark Adams                       } else {
14408f7e8f9dSMark Adams                         PetscScalar    *start, *end, *pAkjv=NULL;
14418f7e8f9dSMark Adams                         PetscInt       high, low;
14428f7e8f9dSMark Adams                         const PetscInt *startj;
14438f7e8f9dSMark Adams                         if (col<myk) { // L
14448f7e8f9dSMark Adams                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
14458f7e8f9dSMark Adams                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
14468f7e8f9dSMark Adams                           start = pLki+1; // start at pLki+1, A22(myk,1)
14478f7e8f9dSMark Adams                           startj= bj_d + bi_d[myk] + idx;
14488f7e8f9dSMark Adams                           end   = ba_d + bi_d[myk+1];
14498f7e8f9dSMark Adams                         } else {
14508f7e8f9dSMark Adams                           PetscInt idx = bdiag_d[myk+1]+1;
14518f7e8f9dSMark Adams                           start = ba_d + idx;
14528f7e8f9dSMark Adams                           startj= bj_d + idx;
14538f7e8f9dSMark Adams                           end   = ba_d + bdiag_d[myk];
14548f7e8f9dSMark Adams                         }
14558f7e8f9dSMark Adams                         // search for 'col', use bisection search - TODO
14568f7e8f9dSMark Adams                         low  = 0;
14578f7e8f9dSMark Adams                         high = (PetscInt)(end-start);
14588f7e8f9dSMark Adams                         while (high-low > 5) {
14598f7e8f9dSMark Adams                           int t = (low+high)/2;
14608f7e8f9dSMark Adams                           if (startj[t] > col) high = t;
14618f7e8f9dSMark Adams                           else                 low  = t;
14628f7e8f9dSMark Adams                         }
14638f7e8f9dSMark Adams                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
14648f7e8f9dSMark Adams                           if (startj[pAkjv-start] == col) break;
14658f7e8f9dSMark Adams                         }
14668f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
146799551766SMark Adams                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
146899551766SMark Adams #endif
14698f7e8f9dSMark Adams                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
14708f7e8f9dSMark Adams                       }
14718f7e8f9dSMark Adams                     });
14728f7e8f9dSMark Adams               }
14738f7e8f9dSMark Adams             });
14748f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
14758f7e8f9dSMark Adams           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
14768f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
14778f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
14788f7e8f9dSMark Adams       });
14798f7e8f9dSMark Adams     Kokkos::fence();
14808f7e8f9dSMark Adams     Kokkos::deep_copy (h_flops_k, d_flops_k);
14819566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
14828f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
14838f7e8f9dSMark Adams         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
14848f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
14858f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
14868f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
14878f7e8f9dSMark Adams             int i = start + outer_index*Ni + lg_rank%Ni;
14888f7e8f9dSMark Adams             if (i < end) {
14898f7e8f9dSMark Adams               PetscScalar *pv = ba_d + bdiag_d[i];
14908f7e8f9dSMark Adams               *pv = 1.0/(*pv);
14918f7e8f9dSMark Adams             }
14928f7e8f9dSMark Adams           });
14938f7e8f9dSMark Adams       });
14948f7e8f9dSMark Adams   }
14959566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
14969566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isicol,&ic_h));
14979566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&r_h));
14988f7e8f9dSMark Adams 
14999566063dSJacob Faibussowitsch   PetscCall(ISIdentity(isrow,&row_identity));
15009566063dSJacob Faibussowitsch   PetscCall(ISIdentity(isicol,&col_identity));
15018f7e8f9dSMark Adams   if (b->inode.size) {
15028f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
15038f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
15048f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
15058f7e8f9dSMark Adams   } else {
15068f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
15078f7e8f9dSMark Adams   }
15088f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
15099566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU
15108f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
15118f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
15128f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
15138f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
15148f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
15158f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
15168f7e8f9dSMark Adams 
1517930e68a5SMark Adams   PetscFunctionReturn(0);
1518930e68a5SMark Adams }
1519930e68a5SMark Adams 
152086a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1521930e68a5SMark Adams {
1522930e68a5SMark Adams   PetscFunctionBegin;
15239566063dSJacob Faibussowitsch   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
152486a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
152586a27549SJunchao Zhang   PetscFunctionReturn(0);
152686a27549SJunchao Zhang }
152786a27549SJunchao Zhang 
152886a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
152986a27549SJunchao Zhang {
153086a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
153186a27549SJunchao Zhang 
153286a27549SJunchao Zhang   PetscFunctionBegin;
153386a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
153486a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
153586a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
153686a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
153786a27549SJunchao Zhang   }
153886a27549SJunchao Zhang   PetscFunctionReturn(0);
153986a27549SJunchao Zhang }
154086a27549SJunchao Zhang 
154186a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
154286a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
154386a27549SJunchao Zhang {
154486a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1545076ba34aSJunchao Zhang   MatColIdxType             n = A->rmap->n;
154686a27549SJunchao Zhang 
154786a27549SJunchao Zhang   PetscFunctionBegin;
154886a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
154986a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
1550076ba34aSJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
155186a27549SJunchao Zhang     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1552076ba34aSJunchao Zhang     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1553076ba34aSJunchao Zhang     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
155486a27549SJunchao Zhang 
155586a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1556076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1557076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1558076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
155986a27549SJunchao Zhang         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
156086a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
156186a27549SJunchao Zhang 
156286a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
156386a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
156486a27549SJunchao Zhang     */
1565076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1566076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
156786a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
156886a27549SJunchao Zhang 
156986a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
157086a27549SJunchao Zhang 
157186a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
1572076ba34aSJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
157386a27549SJunchao Zhang     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1574076ba34aSJunchao Zhang     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1575076ba34aSJunchao Zhang     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
157686a27549SJunchao Zhang 
157786a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1578076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1579076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1580076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
158186a27549SJunchao Zhang         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
158286a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
158386a27549SJunchao Zhang 
158486a27549SJunchao Zhang     /* Sort indices. See comments above */
1585076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1586076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
158786a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
158886a27549SJunchao Zhang 
158986a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
159086a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
159186a27549SJunchao Zhang   }
159286a27549SJunchao Zhang   PetscFunctionReturn(0);
159386a27549SJunchao Zhang }
159486a27549SJunchao Zhang 
159586a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
159686a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
159786a27549SJunchao Zhang {
159886a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
159986a27549SJunchao Zhang   PetscScalarKokkosView          xv;
160086a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
160186a27549SJunchao Zhang 
160286a27549SJunchao Zhang   PetscFunctionBegin;
16039566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
16049566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
16059566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(b,&bv));
16069566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(x,&xv));
160786a27549SJunchao Zhang   /* Solve L tmpv = b */
16089566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
160986a27549SJunchao Zhang   /* Solve Ux = tmpv */
16109566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
16119566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(b,&bv));
16129566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
16139566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
161486a27549SJunchao Zhang   PetscFunctionReturn(0);
161586a27549SJunchao Zhang }
161686a27549SJunchao Zhang 
1617076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
161886a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
161986a27549SJunchao Zhang {
162086a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
162186a27549SJunchao Zhang   PetscScalarKokkosView          xv;
162286a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
162386a27549SJunchao Zhang 
162486a27549SJunchao Zhang   PetscFunctionBegin;
16259566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
16269566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
16279566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(b,&bv));
16289566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(x,&xv));
162986a27549SJunchao Zhang   /* Solve U^T tmpv = b */
163086a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
163186a27549SJunchao Zhang 
163286a27549SJunchao Zhang   /* Solve L^T x = tmpv */
163386a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
16349566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(b,&bv));
16359566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
16369566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
163786a27549SJunchao Zhang   PetscFunctionReturn(0);
163886a27549SJunchao Zhang }
163986a27549SJunchao Zhang 
164086a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
164186a27549SJunchao Zhang {
164286a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
164386a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
164486a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
164586a27549SJunchao Zhang 
164686a27549SJunchao Zhang   PetscFunctionBegin;
16479566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
16489566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1649076ba34aSJunchao Zhang 
1650076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1651076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1652076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1653076ba34aSJunchao Zhang 
1654076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
165586a27549SJunchao Zhang 
165686a27549SJunchao Zhang   B->assembled                       = PETSC_TRUE;
165786a27549SJunchao Zhang   B->preallocated                    = PETSC_TRUE;
165886a27549SJunchao Zhang   B->ops->solve                      = MatSolve_SeqAIJKokkos;
165986a27549SJunchao Zhang   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
166086a27549SJunchao Zhang   B->ops->matsolve                   = NULL;
166186a27549SJunchao Zhang   B->ops->matsolvetranspose          = NULL;
166286a27549SJunchao Zhang   B->offloadmask                     = PETSC_OFFLOAD_GPU;
166386a27549SJunchao Zhang 
166486a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
166586a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
166686a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1667eeadb341SJunchao Zhang   /* TODO: log flops, but how to know that? */
16689566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
166986a27549SJunchao Zhang   PetscFunctionReturn(0);
167086a27549SJunchao Zhang }
167186a27549SJunchao Zhang 
167286a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
167386a27549SJunchao Zhang {
167486a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok;
167586a27549SJunchao Zhang   Mat_SeqAIJ                     *b;
167686a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
167786a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
167886a27549SJunchao Zhang   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
167986a27549SJunchao Zhang   PetscInt                       n = A->rmap->n;
168086a27549SJunchao Zhang 
168186a27549SJunchao Zhang   PetscFunctionBegin;
16829566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
168386a27549SJunchao Zhang   /* Rebuild factors */
168486a27549SJunchao Zhang   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
168586a27549SJunchao Zhang   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
168686a27549SJunchao Zhang 
168786a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
168886a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
168986a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
169086a27549SJunchao Zhang 
169186a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
169286a27549SJunchao Zhang 
169386a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
169486a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
169586a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d,n+1);
169686a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
169786a27549SJunchao Zhang 
169886a27549SJunchao Zhang   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1699076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1700076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1701076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
170286a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
170386a27549SJunchao Zhang 
170486a27549SJunchao Zhang   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
170586a27549SJunchao Zhang   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
170686a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
170786a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
170886a27549SJunchao Zhang 
170986a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
171086a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
171186a27549SJunchao Zhang  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
171286a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
171386a27549SJunchao Zhang  #else
171486a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
171586a27549SJunchao Zhang  #endif
171686a27549SJunchao Zhang 
171786a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
171886a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
171986a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
172086a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
172186a27549SJunchao Zhang 
172286a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
17239566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL));
172486a27549SJunchao Zhang   b     = (Mat_SeqAIJ*)B->data;
172586a27549SJunchao Zhang   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
172686a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
172786a27549SJunchao Zhang   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
172886a27549SJunchao Zhang 
172986a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
173086a27549SJunchao Zhang   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1731930e68a5SMark Adams   PetscFunctionReturn(0);
1732930e68a5SMark Adams }
1733930e68a5SMark Adams 
17348f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
17358f7e8f9dSMark Adams {
17368f7e8f9dSMark Adams   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
17378f7e8f9dSMark Adams   const PetscInt   nrows   = A->rmap->n;
1738930e68a5SMark Adams 
17398f7e8f9dSMark Adams   PetscFunctionBegin;
17409566063dSJacob Faibussowitsch   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
17418f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
17428f7e8f9dSMark Adams   // move B data into Kokkos
17439566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
17449566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
17458f7e8f9dSMark Adams   {
17468f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1747300d22a6SJunchao Zhang     if (!baijkok->diag_d.extent(0)) {
17488f7e8f9dSMark Adams       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1749300d22a6SJunchao Zhang       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1750300d22a6SJunchao Zhang       Kokkos::deep_copy (baijkok->diag_d, h_diag);
17518f7e8f9dSMark Adams     }
17528f7e8f9dSMark Adams   }
17538f7e8f9dSMark Adams   PetscFunctionReturn(0);
17548f7e8f9dSMark Adams }
17558f7e8f9dSMark Adams 
175686a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1757930e68a5SMark Adams {
1758930e68a5SMark Adams   PetscFunctionBegin;
1759930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
1760930e68a5SMark Adams   PetscFunctionReturn(0);
1761930e68a5SMark Adams }
1762930e68a5SMark Adams 
17638f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
17648f7e8f9dSMark Adams {
17658f7e8f9dSMark Adams   PetscFunctionBegin;
17668f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
17678f7e8f9dSMark Adams   PetscFunctionReturn(0);
17688f7e8f9dSMark Adams }
17698f7e8f9dSMark Adams 
1770930e68a5SMark Adams /*MC
177186a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
177286a27549SJunchao Zhang   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1773930e68a5SMark Adams 
1774930e68a5SMark Adams   Level: beginner
1775930e68a5SMark Adams 
1776db781477SPatrick Sanan .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1777930e68a5SMark Adams M*/
177886a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1779930e68a5SMark Adams {
1780930e68a5SMark Adams   PetscInt       n = A->rmap->n;
1781930e68a5SMark Adams 
1782930e68a5SMark Adams   PetscFunctionBegin;
17839566063dSJacob Faibussowitsch   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
17849566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*B,n,n,n,n));
1785930e68a5SMark Adams   (*B)->factortype = ftype;
17869566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
17879566063dSJacob Faibussowitsch   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1788930e68a5SMark Adams 
17898f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
17909566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
179186a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_TRUE;
179286a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
179386a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
17949566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
179586a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
179686a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
179798921bdaSJacob Faibussowitsch   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1798930e68a5SMark Adams 
17999566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
18009566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos));
1801930e68a5SMark Adams   PetscFunctionReturn(0);
1802930e68a5SMark Adams }
18038f7e8f9dSMark Adams 
18048f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
18058f7e8f9dSMark Adams {
18068f7e8f9dSMark Adams   PetscInt       n = A->rmap->n;
18078f7e8f9dSMark Adams 
18088f7e8f9dSMark Adams   PetscFunctionBegin;
18099566063dSJacob Faibussowitsch   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
18109566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*B,n,n,n,n));
18118f7e8f9dSMark Adams   (*B)->factortype = ftype;
1812f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
18139566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
18149566063dSJacob Faibussowitsch   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
18158f7e8f9dSMark Adams 
18168f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
18179566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
18188f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
18198f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
18208f7e8f9dSMark Adams 
18219566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
18229566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device));
18238f7e8f9dSMark Adams   PetscFunctionReturn(0);
18248f7e8f9dSMark Adams }
182586a27549SJunchao Zhang 
182686a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
182786a27549SJunchao Zhang {
182886a27549SJunchao Zhang   PetscFunctionBegin;
18299566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos));
18309566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos));
18319566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device));
183286a27549SJunchao Zhang   PetscFunctionReturn(0);
183386a27549SJunchao Zhang }
183486a27549SJunchao Zhang 
1835076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
1836076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1837076ba34aSJunchao Zhang {
1838076ba34aSJunchao Zhang   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1839076ba34aSJunchao Zhang   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1840076ba34aSJunchao Zhang   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1841076ba34aSJunchao Zhang   const PetscInt    *i = iv.data();
1842076ba34aSJunchao Zhang   const PetscInt    *j = jv.data();
1843076ba34aSJunchao Zhang   const PetscScalar *a = av.data();
1844076ba34aSJunchao Zhang   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1845076ba34aSJunchao Zhang 
1846076ba34aSJunchao Zhang   PetscFunctionBegin;
18479566063dSJacob Faibussowitsch   PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz));
1848076ba34aSJunchao Zhang   for (PetscInt k=0; k<m; k++) {
18499566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k));
1850076ba34aSJunchao Zhang     for (PetscInt p=i[k]; p<i[k+1]; p++) {
185163a3b9bcSJacob Faibussowitsch       PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],(double)PetscRealPart(a[p])));
1852076ba34aSJunchao Zhang     }
18539566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n"));
1854076ba34aSJunchao Zhang   }
1855076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1856076ba34aSJunchao Zhang }
1857