xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 866eb059839330cbfd55a25ceab33c006e5089e7)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2076ba34aSJunchao Zhang #include <petscpkg_version.h>
3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
442550becSJunchao Zhang #include <petsc/private/sfimpl.h>
58c3ff71bSJunchao Zhang #include <petscsystypes.h>
68c3ff71bSJunchao Zhang #include <petscerror.h>
78c3ff71bSJunchao Zhang 
88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
9f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
108c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
118c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
14076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
15076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
1686a27549SJunchao Zhang 
1742550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
188c3ff71bSJunchao Zhang 
19f98996d3SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 6, 99)
20f98996d3SJunchao Zhang #include <KokkosSparse_Utils.hpp>
21f98996d3SJunchao Zhang using KokkosSparse::sort_crs_matrix;
229371c9d4SSatish Balay using KokkosSparse::Impl::transpose_matrix;
23f98996d3SJunchao Zhang #else
24f98996d3SJunchao Zhang #include <KokkosKernels_Sorting.hpp>
25f98996d3SJunchao Zhang using KokkosKernels::sort_crs_matrix;
269371c9d4SSatish Balay using KokkosKernels::Impl::transpose_matrix;
27f98996d3SJunchao Zhang #endif
28f98996d3SJunchao Zhang 
298c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
308c3ff71bSJunchao Zhang 
31076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
32076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
33076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
34076ba34aSJunchao Zhang  */
359371c9d4SSatish Balay static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) {
36076ba34aSJunchao Zhang   Mat_SeqAIJ       *aijseq;
37076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
388c3ff71bSJunchao Zhang 
398c3ff71bSJunchao Zhang   PetscFunctionBegin;
40076ba34aSJunchao Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
419566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
42076ba34aSJunchao Zhang 
43076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
44076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
45076ba34aSJunchao Zhang 
46076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
47076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
48076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
49076ba34aSJunchao Zhang   */
50076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
51076ba34aSJunchao Zhang     delete aijkok;
52076ba34aSJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
53076ba34aSJunchao Zhang     A->spptr = aijkok;
54076ba34aSJunchao Zhang   }
55076ba34aSJunchao Zhang 
565519a089SJose E. Roman   if (aijkok->device_mat_d.data()) {
57a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
58a587d139SMark   }
598c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
608c3ff71bSJunchao Zhang }
618c3ff71bSJunchao Zhang 
6286a27549SJunchao Zhang /* Sync CSR data to device if not yet */
639371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) {
648c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
658c3ff71bSJunchao Zhang 
668c3ff71bSJunchao Zhang   PetscFunctionBegin;
675f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from host to device");
685f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
69076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
70076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
71580c7c76SPierre Jolivet     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
7286a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
738c3ff71bSJunchao Zhang   }
748c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
758c3ff71bSJunchao Zhang }
768c3ff71bSJunchao Zhang 
77076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
789371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) {
7986a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
8086a27549SJunchao Zhang 
8186a27549SJunchao Zhang   PetscFunctionBegin;
825f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
8386a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
8486a27549SJunchao Zhang   aijkok->a_dual.modify_device();
8586a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
8686a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
879566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
889566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
8986a27549SJunchao Zhang   PetscFunctionReturn(0);
9086a27549SJunchao Zhang }
9186a27549SJunchao Zhang 
929371c9d4SSatish Balay static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) {
93f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
94f0cf5187SStefano Zampini 
95f0cf5187SStefano Zampini   PetscFunctionBegin;
96f0cf5187SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
9786a27549SJunchao Zhang   /* We do not expect one needs factors on host  */
985f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from device to host");
995f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
100076ba34aSJunchao Zhang   aijkok->a_dual.sync_host();
101f0cf5187SStefano Zampini   PetscFunctionReturn(0);
102f0cf5187SStefano Zampini }
103f0cf5187SStefano Zampini 
1049371c9d4SSatish Balay static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) {
105076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
106f0cf5187SStefano Zampini 
107f0cf5187SStefano Zampini   PetscFunctionBegin;
1085519a089SJose E. Roman   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
1095519a089SJose E. Roman     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
1105519a089SJose E. Roman     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
1115519a089SJose E. Roman     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
1125519a089SJose E. Roman   */
1135519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
114076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
115076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
116076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
117076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
118076ba34aSJunchao Zhang   }
119076ba34aSJunchao Zhang   PetscFunctionReturn(0);
120076ba34aSJunchao Zhang }
121076ba34aSJunchao Zhang 
1229371c9d4SSatish Balay static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) {
123076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
124076ba34aSJunchao Zhang 
125076ba34aSJunchao Zhang   PetscFunctionBegin;
1265519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
127076ba34aSJunchao Zhang   PetscFunctionReturn(0);
128076ba34aSJunchao Zhang }
129076ba34aSJunchao Zhang 
1309371c9d4SSatish Balay static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) {
131076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
132076ba34aSJunchao Zhang 
133076ba34aSJunchao Zhang   PetscFunctionBegin;
1345519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
135076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
136076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1372328674fSJunchao Zhang   } else {
1382328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
1392328674fSJunchao Zhang   }
140076ba34aSJunchao Zhang   PetscFunctionReturn(0);
141076ba34aSJunchao Zhang }
142076ba34aSJunchao Zhang 
1439371c9d4SSatish Balay static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) {
144076ba34aSJunchao Zhang   PetscFunctionBegin;
145076ba34aSJunchao Zhang   *array = NULL;
146076ba34aSJunchao Zhang   PetscFunctionReturn(0);
147076ba34aSJunchao Zhang }
148076ba34aSJunchao Zhang 
1499371c9d4SSatish Balay static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) {
150076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
151076ba34aSJunchao Zhang 
152076ba34aSJunchao Zhang   PetscFunctionBegin;
1535519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
154076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1552328674fSJunchao Zhang   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
1562328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
1572328674fSJunchao Zhang   }
158076ba34aSJunchao Zhang   PetscFunctionReturn(0);
159076ba34aSJunchao Zhang }
160076ba34aSJunchao Zhang 
1619371c9d4SSatish Balay static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) {
162076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
163076ba34aSJunchao Zhang 
164076ba34aSJunchao Zhang   PetscFunctionBegin;
1655519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
166076ba34aSJunchao Zhang     aijkok->a_dual.clear_sync_state();
167076ba34aSJunchao Zhang     aijkok->a_dual.modify_host();
1682328674fSJunchao Zhang   }
169f0cf5187SStefano Zampini   PetscFunctionReturn(0);
170f0cf5187SStefano Zampini }
171f0cf5187SStefano Zampini 
1729371c9d4SSatish Balay static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) {
1737ee59b9bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1747ee59b9bSJunchao Zhang 
1757ee59b9bSJunchao Zhang   PetscFunctionBegin;
1767ee59b9bSJunchao Zhang   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
1777ee59b9bSJunchao Zhang 
1787ee59b9bSJunchao Zhang   if (i) *i = aijkok->i_device_data();
1797ee59b9bSJunchao Zhang   if (j) *j = aijkok->j_device_data();
1807ee59b9bSJunchao Zhang   if (a) {
1817ee59b9bSJunchao Zhang     aijkok->a_dual.sync_device();
1827ee59b9bSJunchao Zhang     *a = aijkok->a_device_data();
1837ee59b9bSJunchao Zhang   }
1847ee59b9bSJunchao Zhang   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
1857ee59b9bSJunchao Zhang   PetscFunctionReturn(0);
1867ee59b9bSJunchao Zhang }
1877ee59b9bSJunchao Zhang 
188a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
1899371c9d4SSatish Balay PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) {
190042217e8SBarry Smith   Mat_SeqAIJKokkos                            *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
191042217e8SBarry Smith   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
192a587d139SMark 
193a587d139SMark   PetscFunctionBegin;
1945f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
195152b3e56SJunchao Zhang   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k);
196a587d139SMark   Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k);
197a587d139SMark   PetscFunctionReturn(0);
198a587d139SMark }
199a587d139SMark 
200a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
2019371c9d4SSatish Balay PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) {
202042217e8SBarry Smith   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
203a587d139SMark 
204a587d139SMark   PetscFunctionBegin;
205a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
206a587d139SMark     *d_mat = aijkok->device_mat_d.data();
207a587d139SMark   } else {
2089566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
209a587d139SMark     *d_mat = NULL;
210a587d139SMark   }
211a587d139SMark   PetscFunctionReturn(0);
212a587d139SMark }
213076ba34aSJunchao Zhang 
214076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */
2159371c9d4SSatish Balay static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) {
216152b3e56SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
217152b3e56SJunchao Zhang 
218152b3e56SJunchao Zhang   PetscFunctionBegin;
2195f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
220076ba34aSJunchao Zhang   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
221076ba34aSJunchao Zhang     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
2229566063dSJacob Faibussowitsch     PetscCallCXX(aijkok->a_dual.sync_device());
223f98996d3SJunchao Zhang     PetscCallCXX(aijkok->csrmatT = transpose_matrix(aijkok->csrmat));
224f98996d3SJunchao Zhang     PetscCallCXX(sort_crs_matrix(aijkok->csrmatT));
22586a27549SJunchao Zhang     aijkok->transpose_updated = PETSC_TRUE;
226076ba34aSJunchao Zhang   }
227076ba34aSJunchao Zhang   *csrmatT = &aijkok->csrmatT;
228152b3e56SJunchao Zhang   PetscFunctionReturn(0);
229152b3e56SJunchao Zhang }
230152b3e56SJunchao Zhang 
231076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */
2329371c9d4SSatish Balay static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) {
233152b3e56SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
234152b3e56SJunchao Zhang 
235152b3e56SJunchao Zhang   PetscFunctionBegin;
2369566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2375f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
238076ba34aSJunchao Zhang   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
2399566063dSJacob Faibussowitsch     PetscCallCXX(aijkok->a_dual.sync_device());
240f98996d3SJunchao Zhang     PetscCallCXX(aijkok->csrmatH = transpose_matrix(aijkok->csrmat));
241f98996d3SJunchao Zhang     PetscCallCXX(sort_crs_matrix(aijkok->csrmatH));
242076ba34aSJunchao Zhang #if defined(PETSC_USE_COMPLEX)
243076ba34aSJunchao Zhang     const auto &a = aijkok->csrmatH.values;
2449371c9d4SSatish Balay     Kokkos::parallel_for(
2459371c9d4SSatish Balay       a.extent(0), KOKKOS_LAMBDA(MatRowMapType i) { a(i) = PetscConj(a(i)); });
246076ba34aSJunchao Zhang #endif
24786a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_TRUE;
248076ba34aSJunchao Zhang   }
249076ba34aSJunchao Zhang   *csrmatH = &aijkok->csrmatH;
2509566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
251152b3e56SJunchao Zhang   PetscFunctionReturn(0);
252152b3e56SJunchao Zhang }
253a587d139SMark 
2548c3ff71bSJunchao Zhang /* y = A x */
2559371c9d4SSatish Balay static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) {
2568c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
257152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
258152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
2598c3ff71bSJunchao Zhang 
2608c3ff71bSJunchao Zhang   PetscFunctionBegin;
2619566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2629566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2639566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
2649566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
2658c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
266152b3e56SJunchao Zhang   KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv); /* y = alpha A x + beta y */
2679566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
2689566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
269076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
2709566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
2719566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
2728c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2738c3ff71bSJunchao Zhang }
2748c3ff71bSJunchao Zhang 
2758c3ff71bSJunchao Zhang /* y = A^T x */
2769371c9d4SSatish Balay static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) {
2778c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
278152b3e56SJunchao Zhang   const char                *mode;
279152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
280152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
281076ba34aSJunchao Zhang   KokkosCsrMatrix           *csrmat;
2828c3ff71bSJunchao Zhang 
2838c3ff71bSJunchao Zhang   PetscFunctionBegin;
2849566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2859566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2869566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
2879566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
288152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
2899566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
290152b3e56SJunchao Zhang     mode = "N";
291152b3e56SJunchao Zhang   } else {
292076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
293076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
294152b3e56SJunchao Zhang     mode   = "T";
295152b3e56SJunchao Zhang   }
296076ba34aSJunchao Zhang   KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv); /* y = alpha A^T x + beta y */
2979566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
2989566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
2999566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
3009566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3018c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3028c3ff71bSJunchao Zhang }
3038c3ff71bSJunchao Zhang 
3048c3ff71bSJunchao Zhang /* y = A^H x */
3059371c9d4SSatish Balay static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) {
3068c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
307152b3e56SJunchao Zhang   const char                *mode;
308152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv;
309152b3e56SJunchao Zhang   PetscScalarKokkosView      yv;
310076ba34aSJunchao Zhang   KokkosCsrMatrix           *csrmat;
3118c3ff71bSJunchao Zhang 
3128c3ff71bSJunchao Zhang   PetscFunctionBegin;
3139566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3149566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3159566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
3169566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy, &yv));
317152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
3189566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
319152b3e56SJunchao Zhang     mode = "N";
320152b3e56SJunchao Zhang   } else {
321076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
322076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
323152b3e56SJunchao Zhang     mode   = "C";
324152b3e56SJunchao Zhang   }
325076ba34aSJunchao Zhang   KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv); /* y = alpha A^H x + beta y */
3269566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
3279566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
3289566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
3299566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3308c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3318c3ff71bSJunchao Zhang }
3328c3ff71bSJunchao Zhang 
3338c3ff71bSJunchao Zhang /* z = A x + y */
3349371c9d4SSatish Balay static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) {
3358c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
336152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv, yv;
337152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
3388c3ff71bSJunchao Zhang 
3398c3ff71bSJunchao Zhang   PetscFunctionBegin;
3409566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3419566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3429566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
3439566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(yy, &yv));
3449566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(zz, &zv));
3458c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv, yv);
3468c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
347152b3e56SJunchao Zhang   KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv); /* z = alpha A x + beta z */
3489566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
3499566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(yy, &yv));
3509566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
3519566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
3529566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3538c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3548c3ff71bSJunchao Zhang }
3558c3ff71bSJunchao Zhang 
3568c3ff71bSJunchao Zhang /* z = A^T x + y */
3579371c9d4SSatish Balay static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) {
3588c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
359152b3e56SJunchao Zhang   const char                *mode;
360152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv, yv;
361152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
362076ba34aSJunchao Zhang   KokkosCsrMatrix           *csrmat;
3638c3ff71bSJunchao Zhang 
3648c3ff71bSJunchao Zhang   PetscFunctionBegin;
3659566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3669566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3679566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
3689566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(yy, &yv));
3699566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(zz, &zv));
3708c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv, yv);
371152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
3729566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
373152b3e56SJunchao Zhang     mode = "N";
374152b3e56SJunchao Zhang   } else {
375076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
376076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
377152b3e56SJunchao Zhang     mode   = "T";
378152b3e56SJunchao Zhang   }
379076ba34aSJunchao Zhang   KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv); /* z = alpha A^T x + beta z */
3809566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
3819566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(yy, &yv));
3829566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
3839566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
3849566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3858c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3868c3ff71bSJunchao Zhang }
3878c3ff71bSJunchao Zhang 
3888c3ff71bSJunchao Zhang /* z = A^H x + y */
3899371c9d4SSatish Balay static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) {
3908c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok;
391152b3e56SJunchao Zhang   const char                *mode;
392152b3e56SJunchao Zhang   ConstPetscScalarKokkosView xv, yv;
393152b3e56SJunchao Zhang   PetscScalarKokkosView      zv;
394076ba34aSJunchao Zhang   KokkosCsrMatrix           *csrmat;
3958c3ff71bSJunchao Zhang 
3968c3ff71bSJunchao Zhang   PetscFunctionBegin;
3979566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3989566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3999566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx, &xv));
4009566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(yy, &yv));
4019566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(zz, &zv));
4028c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv, yv);
403152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4049566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
405152b3e56SJunchao Zhang     mode = "N";
406152b3e56SJunchao Zhang   } else {
407076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
408076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
409152b3e56SJunchao Zhang     mode   = "C";
410152b3e56SJunchao Zhang   }
411076ba34aSJunchao Zhang   KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv); /* z = alpha A^H x + beta z */
4129566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx, &xv));
4139566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(yy, &yv));
4149566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(zz, &zv));
4159566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz()));
4169566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
417152b3e56SJunchao Zhang   PetscFunctionReturn(0);
418152b3e56SJunchao Zhang }
419152b3e56SJunchao Zhang 
4209371c9d4SSatish Balay PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) {
421152b3e56SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
422152b3e56SJunchao Zhang 
423152b3e56SJunchao Zhang   PetscFunctionBegin;
424152b3e56SJunchao Zhang   switch (op) {
425152b3e56SJunchao Zhang   case MAT_FORM_EXPLICIT_TRANSPOSE:
426152b3e56SJunchao Zhang     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
4279566063dSJacob Faibussowitsch     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
428152b3e56SJunchao Zhang     A->form_explicit_transpose = flg;
429152b3e56SJunchao Zhang     break;
4309371c9d4SSatish Balay   default: PetscCall(MatSetOption_SeqAIJ(A, op, flg)); break;
431152b3e56SJunchao Zhang   }
4328c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4338c3ff71bSJunchao Zhang }
4348c3ff71bSJunchao Zhang 
435076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
4369371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) {
437076ba34aSJunchao Zhang   Mat_SeqAIJ *aseq;
4388c3ff71bSJunchao Zhang 
4398c3ff71bSJunchao Zhang   PetscFunctionBegin;
4409566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
441076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {                      /* Build a brand new mat */
4429566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));  /* the returned newmat is a SeqAIJKokkos */
4438c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
4449566063dSJacob Faibussowitsch     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
445076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
4465f80ce2aSJacob Faibussowitsch     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
4479566063dSJacob Faibussowitsch     PetscCall(PetscFree(A->defaultvectype));
4489566063dSJacob Faibussowitsch     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
4499566063dSJacob Faibussowitsch     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
4509566063dSJacob Faibussowitsch     PetscCall(MatSetOps_SeqAIJKokkos(A));
451076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ *>(A->data);
452394ed5ebSJunchao Zhang     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
4535f80ce2aSJacob Faibussowitsch       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
454076ba34aSJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE);
4558c3ff71bSJunchao Zhang     }
456076ba34aSJunchao Zhang   }
4578c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4588c3ff71bSJunchao Zhang }
4598c3ff71bSJunchao Zhang 
460076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
461076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
462076ba34aSJunchao Zhang  */
4639371c9d4SSatish Balay static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) {
464076ba34aSJunchao Zhang   Mat_SeqAIJ       *bseq;
465076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
466076ba34aSJunchao Zhang   Mat               mat;
4678c3ff71bSJunchao Zhang 
4688c3ff71bSJunchao Zhang   PetscFunctionBegin;
469076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
4709566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
471076ba34aSJunchao Zhang   mat = *B;
472076ba34aSJunchao Zhang   if (A->assembled) {
473076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
474076ba34aSJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE);
475076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
476076ba34aSJunchao Zhang     /* Now copy values to B if needed */
477076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
478076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
479076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
480076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
481076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
482076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
483076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
484076ba34aSJunchao Zhang       }
485076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
486076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
487076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
488076ba34aSJunchao Zhang     }
489076ba34aSJunchao Zhang     mat->spptr = bkok;
490076ba34aSJunchao Zhang   }
491076ba34aSJunchao Zhang 
4929566063dSJacob Faibussowitsch   PetscCall(PetscFree(mat->defaultvectype));
4939566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
4949566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
4959566063dSJacob Faibussowitsch   PetscCall(MatSetOps_SeqAIJKokkos(mat));
4968c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4978c3ff71bSJunchao Zhang }
4988c3ff71bSJunchao Zhang 
4999371c9d4SSatish Balay static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) {
5000ecb592aSJunchao Zhang   Mat               At;
501ff751488SJunchao Zhang   KokkosCsrMatrix  *internT;
5020ecb592aSJunchao Zhang   Mat_SeqAIJKokkos *atkok, *bkok;
5030ecb592aSJunchao Zhang 
5040ecb592aSJunchao Zhang   PetscFunctionBegin;
5057fb60732SBarry Smith   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
5069566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
5070ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
508ff751488SJunchao Zhang     /* Deep copy internT, as we want to isolate the internal transpose */
5099566063dSJacob Faibussowitsch     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", *internT)));
5109566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
5110ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
5129566063dSJacob Faibussowitsch     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
5130ecb592aSJunchao Zhang   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
5140ecb592aSJunchao Zhang     if ((*B)->assembled) {
5150ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
5169566063dSJacob Faibussowitsch       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT->values));
5179566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
5180ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
5190ecb592aSJunchao Zhang       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
5200ecb592aSJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a, internT->nnz()); /* bseq->nz = 0 if unassembled */
5210ecb592aSJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j, internT->nnz());
5229566063dSJacob Faibussowitsch       PetscCallCXX(Kokkos::deep_copy(a_h, internT->values));
5239566063dSJacob Faibussowitsch       PetscCallCXX(Kokkos::deep_copy(j_h, internT->graph.entries));
5240ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
5250ecb592aSJunchao Zhang   }
5260ecb592aSJunchao Zhang   PetscFunctionReturn(0);
5270ecb592aSJunchao Zhang }
5280ecb592aSJunchao Zhang 
5299371c9d4SSatish Balay static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) {
53086a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
5318c3ff71bSJunchao Zhang 
5328c3ff71bSJunchao Zhang   PetscFunctionBegin;
53386a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
53486a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
5358c3ff71bSJunchao Zhang     delete aijkok;
53686a27549SJunchao Zhang   } else {
53786a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
53886a27549SJunchao Zhang   }
539cbc6b225SStefano Zampini   A->spptr = NULL;
5409566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
5419566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
5429566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
5439566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
5448c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5458c3ff71bSJunchao Zhang }
5468c3ff71bSJunchao Zhang 
5473f3ba80aSJunchao Zhang /*MC
5483f3ba80aSJunchao Zhang    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
5493f3ba80aSJunchao Zhang 
5503f3ba80aSJunchao Zhang    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
5513f3ba80aSJunchao Zhang 
5523f3ba80aSJunchao Zhang    Options Database Keys:
5533f3ba80aSJunchao Zhang .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
5543f3ba80aSJunchao Zhang 
5553f3ba80aSJunchao Zhang   Level: beginner
5563f3ba80aSJunchao Zhang 
557db781477SPatrick Sanan .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
5583f3ba80aSJunchao Zhang M*/
5599371c9d4SSatish Balay PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) {
56086a27549SJunchao Zhang   PetscFunctionBegin;
5619566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
5629566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(A));
5639566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
56486a27549SJunchao Zhang   PetscFunctionReturn(0);
56586a27549SJunchao Zhang }
56686a27549SJunchao Zhang 
567076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
5689371c9d4SSatish Balay PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) {
569076ba34aSJunchao Zhang   Mat_SeqAIJ         *a, *b;
570076ba34aSJunchao Zhang   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
571076ba34aSJunchao Zhang   MatScalarKokkosView aa, ba, ca;
572076ba34aSJunchao Zhang   MatRowMapKokkosView ai, bi, ci;
573076ba34aSJunchao Zhang   MatColIdxKokkosView aj, bj, cj;
574076ba34aSJunchao Zhang   PetscInt            m, n, nnz, aN;
575a3f881fbSStefano Zampini 
576a3f881fbSStefano Zampini   PetscFunctionBegin;
577076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
578076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
579076ba34aSJunchao Zhang   PetscValidPointer(C, 4);
580076ba34aSJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
581076ba34aSJunchao Zhang   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
5825f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
5835f80ce2aSJacob Faibussowitsch   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
584076ba34aSJunchao Zhang 
5859566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
5869566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
587076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ *>(A->data);
588076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ *>(B->data);
589076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
590076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
591076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
592076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
593076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
594076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
595076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
596076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
597076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
598076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
599076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
600076ba34aSJunchao Zhang     aj           = akok->j_dual.view_device();
601076ba34aSJunchao Zhang     bj           = bkok->j_dual.view_device();
602076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
603076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
604076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
605076ba34aSJunchao Zhang     ca           = ca_dual.view_device();
606076ba34aSJunchao Zhang     ci           = ci_dual.view_device();
607076ba34aSJunchao Zhang     cj           = cj_dual.view_device();
608076ba34aSJunchao Zhang 
609076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
6109371c9d4SSatish Balay     Kokkos::parallel_for(
6119371c9d4SSatish Balay       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
612076ba34aSJunchao Zhang         PetscInt i       = t.league_rank(); /* row i */
613076ba34aSJunchao Zhang         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
614076ba34aSJunchao Zhang 
615076ba34aSJunchao Zhang         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
616076ba34aSJunchao Zhang                                                    ci(i) = coffset;
617076ba34aSJunchao Zhang                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
618076ba34aSJunchao Zhang         });
619076ba34aSJunchao Zhang 
620076ba34aSJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
621076ba34aSJunchao Zhang           if (k < alen) {
622076ba34aSJunchao Zhang             ca(coffset + k) = aa(ai(i) + k);
623076ba34aSJunchao Zhang             cj(coffset + k) = aj(ai(i) + k);
624076ba34aSJunchao Zhang           } else {
625076ba34aSJunchao Zhang             ca(coffset + k) = ba(bi(i) + k - alen);
626076ba34aSJunchao Zhang             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
627076ba34aSJunchao Zhang           }
628076ba34aSJunchao Zhang         });
629076ba34aSJunchao Zhang       });
630076ba34aSJunchao Zhang     ca_dual.modify_device();
631076ba34aSJunchao Zhang     ci_dual.modify_device();
632076ba34aSJunchao Zhang     cj_dual.modify_device();
6339566063dSJacob Faibussowitsch     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
6349566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
635076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
636076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
637076ba34aSJunchao Zhang     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
638076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
639076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
640076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
641076ba34aSJunchao Zhang 
6429371c9d4SSatish Balay     Kokkos::parallel_for(
6439371c9d4SSatish Balay       Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
644076ba34aSJunchao Zhang         PetscInt i    = t.league_rank(); /* row i */
645076ba34aSJunchao Zhang         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
646076ba34aSJunchao Zhang         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
647076ba34aSJunchao Zhang           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
648076ba34aSJunchao Zhang           else ca(ci(i) + k) = ba(bi(i) + k - alen);
649076ba34aSJunchao Zhang         });
650076ba34aSJunchao Zhang       });
6519566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
652076ba34aSJunchao Zhang   }
653076ba34aSJunchao Zhang   PetscFunctionReturn(0);
654076ba34aSJunchao Zhang }
655076ba34aSJunchao Zhang 
6569371c9d4SSatish Balay static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) {
657076ba34aSJunchao Zhang   PetscFunctionBegin;
658076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
659a3f881fbSStefano Zampini   PetscFunctionReturn(0);
660a3f881fbSStefano Zampini }
661a3f881fbSStefano Zampini 
6629371c9d4SSatish Balay static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) {
663a3f881fbSStefano Zampini   Mat_Product                 *product = C->product;
664a3f881fbSStefano Zampini   Mat                          A, B;
665076ba34aSJunchao Zhang   bool                         transA, transB; /* use bool, since KK needs this type */
666a3f881fbSStefano Zampini   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
667a3f881fbSStefano Zampini   Mat_SeqAIJ                  *c;
668076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos *pdata;
669076ba34aSJunchao Zhang   KokkosCsrMatrix             *csrmatA, *csrmatB;
670a3f881fbSStefano Zampini 
671a3f881fbSStefano Zampini   PetscFunctionBegin;
672a3f881fbSStefano Zampini   MatCheckProduct(C, 1);
6735f80ce2aSJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
674076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
675076ba34aSJunchao Zhang 
676076ba34aSJunchao Zhang   if (pdata->reusesym) {           /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
677076ba34aSJunchao Zhang     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
678076ba34aSJunchao Zhang     PetscFunctionReturn(0);
679076ba34aSJunchao Zhang   }
680076ba34aSJunchao Zhang 
681076ba34aSJunchao Zhang   switch (product->type) {
6829371c9d4SSatish Balay   case MATPRODUCT_AB:
6839371c9d4SSatish Balay     transA = false;
6849371c9d4SSatish Balay     transB = false;
6859371c9d4SSatish Balay     break;
6869371c9d4SSatish Balay   case MATPRODUCT_AtB:
6879371c9d4SSatish Balay     transA = true;
6889371c9d4SSatish Balay     transB = false;
6899371c9d4SSatish Balay     break;
6909371c9d4SSatish Balay   case MATPRODUCT_ABt:
6919371c9d4SSatish Balay     transA = false;
6929371c9d4SSatish Balay     transB = true;
6939371c9d4SSatish Balay     break;
6949371c9d4SSatish Balay   default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
695076ba34aSJunchao Zhang   }
696076ba34aSJunchao Zhang 
697a3f881fbSStefano Zampini   A = product->A;
698a3f881fbSStefano Zampini   B = product->B;
6999566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
7009566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
701a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
702a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
703a3f881fbSStefano Zampini   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
704076ba34aSJunchao Zhang 
7055f80ce2aSJacob Faibussowitsch   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
706076ba34aSJunchao Zhang 
707076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
708076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
709076ba34aSJunchao Zhang 
710076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
711076ba34aSJunchao Zhang   if (transA) {
7129566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
713076ba34aSJunchao Zhang     transA = false;
714a3f881fbSStefano Zampini   }
715a3f881fbSStefano Zampini 
716076ba34aSJunchao Zhang   if (transB) {
7179566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
718076ba34aSJunchao Zhang     transB = false;
719076ba34aSJunchao Zhang   }
7209566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
7219566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, ckok->csrmat));
722*866eb059SJunchao Zhang   auto spgemmHandle = pdata->kh.get_spgemm_handle();
723*866eb059SJunchao Zhang   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
724*866eb059SJunchao Zhang 
7259566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
7269566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(C));
727a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
728a3f881fbSStefano Zampini   c = (Mat_SeqAIJ *)C->data;
7299566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
7309566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
7319566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
732a3f881fbSStefano Zampini   c->reallocs         = 0;
733076ba34aSJunchao Zhang   C->info.mallocs     = 0;
734a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
735a3f881fbSStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
736a3f881fbSStefano Zampini   C->num_ass++;
737a3f881fbSStefano Zampini   PetscFunctionReturn(0);
738a3f881fbSStefano Zampini }
739a3f881fbSStefano Zampini 
7409371c9d4SSatish Balay static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) {
741076ba34aSJunchao Zhang   Mat_Product                 *product = C->product;
742076ba34aSJunchao Zhang   MatProductType               ptype;
743076ba34aSJunchao Zhang   Mat                          A, B;
744076ba34aSJunchao Zhang   bool                         transA, transB;
745076ba34aSJunchao Zhang   Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
746076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos *pdata;
747076ba34aSJunchao Zhang   MPI_Comm                     comm;
748076ba34aSJunchao Zhang   KokkosCsrMatrix             *csrmatA, *csrmatB, csrmatC;
749a3f881fbSStefano Zampini 
750a3f881fbSStefano Zampini   PetscFunctionBegin;
751a3f881fbSStefano Zampini   MatCheckProduct(C, 1);
7529566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
7535f80ce2aSJacob Faibussowitsch   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
754a3f881fbSStefano Zampini   A = product->A;
755a3f881fbSStefano Zampini   B = product->B;
7569566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
7579566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
758a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
759a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
760076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
761076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
762076ba34aSJunchao Zhang 
763a3f881fbSStefano Zampini   ptype = product->type;
764a3f881fbSStefano Zampini   switch (ptype) {
7659371c9d4SSatish Balay   case MATPRODUCT_AB:
7669371c9d4SSatish Balay     transA = false;
7679371c9d4SSatish Balay     transB = false;
7689371c9d4SSatish Balay     break;
7699371c9d4SSatish Balay   case MATPRODUCT_AtB:
7709371c9d4SSatish Balay     transA = true;
7719371c9d4SSatish Balay     transB = false;
7729371c9d4SSatish Balay     break;
7739371c9d4SSatish Balay   case MATPRODUCT_ABt:
7749371c9d4SSatish Balay     transA = false;
7759371c9d4SSatish Balay     transB = true;
7769371c9d4SSatish Balay     break;
7779371c9d4SSatish Balay   default: SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
778a3f881fbSStefano Zampini   }
779a3f881fbSStefano Zampini 
780076ba34aSJunchao Zhang   product->data = pdata = new MatProductData_SeqAIJKokkos();
781076ba34aSJunchao Zhang   pdata->kh.set_team_work_size(16);
782076ba34aSJunchao Zhang   pdata->kh.set_dynamic_scheduling(true);
783076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
784a3f881fbSStefano Zampini 
785076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
786*866eb059SJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
787*866eb059SJunchao Zhang 
788*866eb059SJunchao Zhang   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
789*866eb059SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
790*866eb059SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
791*866eb059SJunchao Zhang   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
792*866eb059SJunchao Zhang #endif
793*866eb059SJunchao Zhang #endif
794*866eb059SJunchao Zhang 
795076ba34aSJunchao Zhang   pdata->kh.create_spgemm_handle(spgemm_alg);
796076ba34aSJunchao Zhang 
7979566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
798076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
799076ba34aSJunchao Zhang   if (transA) {
8009566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
801076ba34aSJunchao Zhang     transA = false;
802076ba34aSJunchao Zhang   }
803076ba34aSJunchao Zhang 
804076ba34aSJunchao Zhang   if (transB) {
8059566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
806076ba34aSJunchao Zhang     transB = false;
807076ba34aSJunchao Zhang   }
808076ba34aSJunchao Zhang 
8099566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC));
810*866eb059SJunchao Zhang 
811076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
812076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
813076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
814076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
815076ba34aSJunchao Zhang   */
8169566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC));
817*866eb059SJunchao Zhang   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
818*866eb059SJunchao Zhang   auto spgemmHandle = pdata->kh.get_spgemm_handle();
819*866eb059SJunchao Zhang   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
8209566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
821076ba34aSJunchao Zhang 
8229566063dSJacob Faibussowitsch   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
8239566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
824076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
825a3f881fbSStefano Zampini   PetscFunctionReturn(0);
826a3f881fbSStefano Zampini }
827a3f881fbSStefano Zampini 
828a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
8299371c9d4SSatish Balay static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) {
830076ba34aSJunchao Zhang   Mat_Product *product = mat->product;
831a3f881fbSStefano Zampini   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
832a3f881fbSStefano Zampini 
833a3f881fbSStefano Zampini   PetscFunctionBegin;
834a3f881fbSStefano Zampini   MatCheckProduct(mat, 1);
8359566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
83648a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
837a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
838a3f881fbSStefano Zampini     switch (product->type) {
839a3f881fbSStefano Zampini     case MATPRODUCT_AB:
840a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
8419371c9d4SSatish Balay     case MATPRODUCT_ABt: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; break;
842a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
843a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
8449371c9d4SSatish Balay     case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
8459371c9d4SSatish Balay     default: break;
846a3f881fbSStefano Zampini     }
847a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
8489566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
849a3f881fbSStefano Zampini   }
850a3f881fbSStefano Zampini   PetscFunctionReturn(0);
851a3f881fbSStefano Zampini }
852a587d139SMark 
8539371c9d4SSatish Balay static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) {
854f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
855f0cf5187SStefano Zampini 
856f0cf5187SStefano Zampini   PetscFunctionBegin;
8579566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
8589566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
859f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
860076ba34aSJunchao Zhang   KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
8619566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
8629566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
8639566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
864f0cf5187SStefano Zampini   PetscFunctionReturn(0);
865f0cf5187SStefano Zampini }
866f0cf5187SStefano Zampini 
8679371c9d4SSatish Balay static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) {
868076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
869a587d139SMark 
870a587d139SMark   PetscFunctionBegin;
871076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
8722328674fSJunchao Zhang   if (aijkok) { /* Only zero the device if data is already there */
873076ba34aSJunchao Zhang     KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0);
8749566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(A));
8752328674fSJunchao Zhang   } else { /* Might be preallocated but not assembled */
8769566063dSJacob Faibussowitsch     PetscCall(MatZeroEntries_SeqAIJ(A));
8772328674fSJunchao Zhang   }
878a587d139SMark   PetscFunctionReturn(0);
879a587d139SMark }
880a587d139SMark 
8819371c9d4SSatish Balay static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) {
882f78ce678SMark Adams   Mat_SeqAIJ           *aijseq;
883f78ce678SMark Adams   Mat_SeqAIJKokkos     *aijkok;
884f78ce678SMark Adams   PetscInt              n;
885f78ce678SMark Adams   PetscScalarKokkosView xv;
886f78ce678SMark Adams 
887f78ce678SMark Adams   PetscFunctionBegin;
888f78ce678SMark Adams   PetscCall(VecGetLocalSize(x, &n));
889f78ce678SMark Adams   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
890f78ce678SMark Adams   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
891f78ce678SMark Adams 
892f78ce678SMark Adams   PetscCall(MatSeqAIJKokkosSyncDevice(A));
893f78ce678SMark Adams   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
894f78ce678SMark Adams 
895f78ce678SMark Adams   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
896f78ce678SMark Adams     PetscCall(MatMarkDiagonal_SeqAIJ(A));
897f78ce678SMark Adams     aijseq = static_cast<Mat_SeqAIJ *>(A->data);
898f78ce678SMark Adams     aijkok->SetDiagonal(aijseq->diag);
899f78ce678SMark Adams   }
900f78ce678SMark Adams 
901f78ce678SMark Adams   const auto &Aa    = aijkok->a_dual.view_device();
902f78ce678SMark Adams   const auto &Ai    = aijkok->i_dual.view_device();
903f78ce678SMark Adams   const auto &Adiag = aijkok->diag_dual.view_device();
904f78ce678SMark Adams 
905f78ce678SMark Adams   PetscCall(VecGetKokkosViewWrite(x, &xv));
9069371c9d4SSatish Balay   Kokkos::parallel_for(
9079371c9d4SSatish Balay     n, KOKKOS_LAMBDA(const PetscInt i) {
908f78ce678SMark Adams       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
909f78ce678SMark Adams       else xv(i) = 0;
910f78ce678SMark Adams     });
911f78ce678SMark Adams   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
912f78ce678SMark Adams   PetscFunctionReturn(0);
913f78ce678SMark Adams }
914f78ce678SMark Adams 
915db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
9169371c9d4SSatish Balay PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) {
917db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
918db78de30SJunchao Zhang 
919db78de30SJunchao Zhang   PetscFunctionBegin;
920db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
921db78de30SJunchao Zhang   PetscValidPointer(kv, 2);
922db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
9239566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
924db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
925076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
926db78de30SJunchao Zhang   PetscFunctionReturn(0);
927db78de30SJunchao Zhang }
928db78de30SJunchao Zhang 
9299371c9d4SSatish Balay PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) {
930db78de30SJunchao Zhang   PetscFunctionBegin;
931db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
932db78de30SJunchao Zhang   PetscValidPointer(kv, 2);
933db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
934db78de30SJunchao Zhang   PetscFunctionReturn(0);
935db78de30SJunchao Zhang }
936db78de30SJunchao Zhang 
9379371c9d4SSatish Balay PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) {
938db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
939db78de30SJunchao Zhang 
940db78de30SJunchao Zhang   PetscFunctionBegin;
941db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
942db78de30SJunchao Zhang   PetscValidPointer(kv, 2);
943db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
9449566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
945db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
946076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
947db78de30SJunchao Zhang   PetscFunctionReturn(0);
948db78de30SJunchao Zhang }
949db78de30SJunchao Zhang 
9509371c9d4SSatish Balay PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) {
951db78de30SJunchao Zhang   PetscFunctionBegin;
952db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
953db78de30SJunchao Zhang   PetscValidPointer(kv, 2);
954db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
9559566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
956db78de30SJunchao Zhang   PetscFunctionReturn(0);
957db78de30SJunchao Zhang }
958db78de30SJunchao Zhang 
9599371c9d4SSatish Balay PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) {
960db78de30SJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
961db78de30SJunchao Zhang 
962db78de30SJunchao Zhang   PetscFunctionBegin;
963db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
964db78de30SJunchao Zhang   PetscValidPointer(kv, 2);
965db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
966db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
967076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
968db78de30SJunchao Zhang   PetscFunctionReturn(0);
969db78de30SJunchao Zhang }
970db78de30SJunchao Zhang 
9719371c9d4SSatish Balay PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) {
972db78de30SJunchao Zhang   PetscFunctionBegin;
973db78de30SJunchao Zhang   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
974db78de30SJunchao Zhang   PetscValidPointer(kv, 2);
975db78de30SJunchao Zhang   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
9769566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
977db78de30SJunchao Zhang   PetscFunctionReturn(0);
978db78de30SJunchao Zhang }
979db78de30SJunchao Zhang 
980c17cf699SJunchao Zhang /* Computes Y += alpha X */
9819371c9d4SSatish Balay static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) {
982a587d139SMark   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
983c17cf699SJunchao Zhang   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
984c17cf699SJunchao Zhang   ConstMatScalarKokkosView Xa;
985c17cf699SJunchao Zhang   MatScalarKokkosView      Ya;
986a587d139SMark 
987a587d139SMark   PetscFunctionBegin;
988c17cf699SJunchao Zhang   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
989c17cf699SJunchao Zhang   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
9909566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
9919566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(X));
9929566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
993db78de30SJunchao Zhang 
994c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
995c17cf699SJunchao Zhang     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
996a587d139SMark     PetscBool e;
9979566063dSJacob Faibussowitsch     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
998a587d139SMark     if (e) {
9999566063dSJacob Faibussowitsch       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1000c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
1001a587d139SMark     }
1002a587d139SMark   }
1003db78de30SJunchao Zhang 
1004c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1005c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1006c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1007c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1008c17cf699SJunchao Zhang   */
1009c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1010c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1011c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
1012c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
1013c17cf699SJunchao Zhang 
1014c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
1015c17cf699SJunchao Zhang     KokkosBlas::axpy(alpha, Xa, Ya);
10169566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1017c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1018c17cf699SJunchao Zhang     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1019c17cf699SJunchao Zhang     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1020c17cf699SJunchao Zhang 
10219371c9d4SSatish Balay     Kokkos::parallel_for(
10229371c9d4SSatish Balay       Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1023c17cf699SJunchao Zhang         PetscInt i = t.league_rank();              /* row i */
1024c17cf699SJunchao Zhang         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* Only one thread works in a team */
1025c17cf699SJunchao Zhang                                                    PetscInt p, q = Yi(i);
1026c17cf699SJunchao Zhang                                                    for (p = Xi(i); p < Xi(i + 1); p++) {          /* For each nonzero on row i of X */
1027c17cf699SJunchao Zhang                                                      while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; /* find the matching nonzero on row i of Y */
1028c17cf699SJunchao Zhang                                                      if (Xj(p) == Yj(q)) {                        /* Found it */
1029c17cf699SJunchao Zhang                                                        Ya(q) += alpha * Xa(p);
1030c17cf699SJunchao Zhang                                                        q++;
1031a587d139SMark                                                      } else {
1032c17cf699SJunchao Zhang                                                        /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1033c17cf699SJunchao Zhang                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1034c17cf699SJunchao Zhang             */
10359371c9d4SSatish Balay                                                        if (Yi(i) != Yi(i + 1))
10369371c9d4SSatish Balay                                                          Ya(Yi(i)) =
10378b8b16f9SJunchao Zhang #if PETSC_PKG_KOKKOS_VERSION_GE(3, 6, 99)
10388b8b16f9SJunchao Zhang                                                            Kokkos::nan("1"); /* auto promote the double NaN if needed */
10398b8b16f9SJunchao Zhang #else
10408b8b16f9SJunchao Zhang               Kokkos::Experimental::nan("1");
10418b8b16f9SJunchao Zhang #endif
1042a587d139SMark                                                      }
1043c17cf699SJunchao Zhang                                                    }
1044c17cf699SJunchao Zhang         });
1045c17cf699SJunchao Zhang       });
10469566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1047c17cf699SJunchao Zhang   } else { /* different nonzero patterns */
1048c17cf699SJunchao Zhang     Mat             Z;
1049c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
1050c17cf699SJunchao Zhang     KernelHandle    kh;
1051c17cf699SJunchao Zhang     kh.create_spadd_handle(false);
1052c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1053c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1054c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
10559566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
10569566063dSJacob Faibussowitsch     PetscCall(MatHeaderReplace(Y, &Z));
1057c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1058c17cf699SJunchao Zhang   }
10599566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
10609566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); /* Because we scaled X and then added it to Y */
1061a587d139SMark   PetscFunctionReturn(0);
1062a587d139SMark }
1063a587d139SMark 
10649371c9d4SSatish Balay static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) {
106542550becSJunchao Zhang   Mat_SeqAIJKokkos *akok;
106642550becSJunchao Zhang   Mat_SeqAIJ       *aseq;
106742550becSJunchao Zhang 
106842550becSJunchao Zhang   PetscFunctionBegin;
10699566063dSJacob Faibussowitsch   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1070394ed5ebSJunchao Zhang   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
107142550becSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1072cbc6b225SStefano Zampini   delete akok;
1073cbc6b225SStefano Zampini   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE);
10749566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1075394ed5ebSJunchao Zhang   akok->SetUpCOO(aseq);
107642550becSJunchao Zhang   PetscFunctionReturn(0);
107742550becSJunchao Zhang }
107842550becSJunchao Zhang 
10799371c9d4SSatish Balay static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) {
108042550becSJunchao Zhang   Mat_SeqAIJ                 *aseq = static_cast<Mat_SeqAIJ *>(A->data);
108142550becSJunchao Zhang   Mat_SeqAIJKokkos           *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1082394ed5ebSJunchao Zhang   PetscCount                  Annz = aseq->nz;
1083394ed5ebSJunchao Zhang   const PetscCountKokkosView &jmap = akok->jmap_d;
1084394ed5ebSJunchao Zhang   const PetscCountKokkosView &perm = akok->perm_d;
108542550becSJunchao Zhang   MatScalarKokkosView         Aa;
108642550becSJunchao Zhang   ConstMatScalarKokkosView    kv;
108742550becSJunchao Zhang   PetscMemType                memtype;
108842550becSJunchao Zhang 
108942550becSJunchao Zhang   PetscFunctionBegin;
10909566063dSJacob Faibussowitsch   PetscCall(PetscGetMemType(v, &memtype));
109142550becSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1092394ed5ebSJunchao Zhang     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n));
109342550becSJunchao Zhang   } else {
1094394ed5ebSJunchao Zhang     kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */
109542550becSJunchao Zhang   }
109642550becSJunchao Zhang 
1097c7b718f4SJunchao Zhang   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1098c7b718f4SJunchao Zhang   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
109942550becSJunchao Zhang 
11009371c9d4SSatish Balay   Kokkos::parallel_for(
11019371c9d4SSatish Balay     Annz, KOKKOS_LAMBDA(const PetscCount i) {
1102c7b718f4SJunchao Zhang       PetscScalar sum = 0.0;
1103c7b718f4SJunchao Zhang       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1104c7b718f4SJunchao Zhang       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1105c7b718f4SJunchao Zhang     });
1106394ed5ebSJunchao Zhang 
11079566063dSJacob Faibussowitsch   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
11089566063dSJacob Faibussowitsch   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
110942550becSJunchao Zhang   PetscFunctionReturn(0);
111042550becSJunchao Zhang }
111142550becSJunchao Zhang 
11129371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag) {
11135fbaff96SJunchao Zhang   Mat_SeqAIJKokkos          *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
11145fbaff96SJunchao Zhang   MatScalarKokkosView        Aa;
11155fbaff96SJunchao Zhang   const MatRowMapKokkosView &Ai = akok->i_dual.view_device();
11165fbaff96SJunchao Zhang   PetscInt                   m  = A->rmap->n;
11175fbaff96SJunchao Zhang   ConstMatRowMapKokkosView   Adiag(diag, m); /* diag is a device pointer */
11185fbaff96SJunchao Zhang 
11195fbaff96SJunchao Zhang   PetscFunctionBegin;
11205fbaff96SJunchao Zhang   PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa));
11219371c9d4SSatish Balay   Kokkos::parallel_for(
11229371c9d4SSatish Balay     m, KOKKOS_LAMBDA(const PetscInt i) {
11235fbaff96SJunchao Zhang       PetscScalar tmp;
11245fbaff96SJunchao Zhang       if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */
11255fbaff96SJunchao Zhang         tmp          = Aa(Ai(i));
11265fbaff96SJunchao Zhang         Aa(Ai(i))    = Aa(Adiag(i));
11275fbaff96SJunchao Zhang         Aa(Adiag(i)) = tmp;
11285fbaff96SJunchao Zhang       }
11295fbaff96SJunchao Zhang     });
11305fbaff96SJunchao Zhang   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
11315fbaff96SJunchao Zhang   PetscFunctionReturn(0);
11325fbaff96SJunchao Zhang }
11335fbaff96SJunchao Zhang 
11349371c9d4SSatish Balay static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) {
11358f7e8f9dSMark Adams   PetscFunctionBegin;
11369566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncHost(A));
11379566063dSJacob Faibussowitsch   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
11388f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
11398f7e8f9dSMark Adams   PetscFunctionReturn(0);
11408f7e8f9dSMark Adams }
11418f7e8f9dSMark Adams 
11429371c9d4SSatish Balay static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) {
1143076ba34aSJunchao Zhang   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1144076ba34aSJunchao Zhang 
11458c3ff71bSJunchao Zhang   PetscFunctionBegin;
1146076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
11476f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
11486f3d89d0SStefano Zampini 
11498c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
11508c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
11518c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1152a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1153f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1154a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1155076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
11568c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
11578c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
11588c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
11598c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
11608c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
11618c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1162076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
11630ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1164152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1165f78ce678SMark Adams   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1166076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1167076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1168076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1169076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1170076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1171076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
11727ee59b9bSJunchao Zhang   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
117342550becSJunchao Zhang 
11749566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
11759566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1176076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1177076ba34aSJunchao Zhang }
1178076ba34aSJunchao Zhang 
11799371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) {
1180076ba34aSJunchao Zhang   Mat_SeqAIJ *aseq;
1181076ba34aSJunchao Zhang   PetscInt    i, m, n;
1182076ba34aSJunchao Zhang 
1183076ba34aSJunchao Zhang   PetscFunctionBegin;
11845f80ce2aSJacob Faibussowitsch   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1185076ba34aSJunchao Zhang 
1186076ba34aSJunchao Zhang   m = akok->nrows();
1187076ba34aSJunchao Zhang   n = akok->ncols();
11889566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(A, m, n, m, n));
11899566063dSJacob Faibussowitsch   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1190076ba34aSJunchao Zhang 
1191076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
11929566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1193076ba34aSJunchao Zhang   aseq = (Mat_SeqAIJ *)(A)->data;
1194076ba34aSJunchao Zhang 
1195076ba34aSJunchao Zhang   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1196076ba34aSJunchao Zhang   akok->j_dual.sync_host();
1197076ba34aSJunchao Zhang 
1198076ba34aSJunchao Zhang   aseq->i            = akok->i_host_data();
1199076ba34aSJunchao Zhang   aseq->j            = akok->j_host_data();
1200076ba34aSJunchao Zhang   aseq->a            = akok->a_host_data();
1201076ba34aSJunchao Zhang   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1202076ba34aSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE;
1203076ba34aSJunchao Zhang   aseq->free_a       = PETSC_FALSE;
1204076ba34aSJunchao Zhang   aseq->free_ij      = PETSC_FALSE;
1205076ba34aSJunchao Zhang   aseq->nz           = akok->nnz();
1206076ba34aSJunchao Zhang   aseq->maxnz        = aseq->nz;
1207076ba34aSJunchao Zhang 
12089566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aseq->imax));
12099566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aseq->ilen));
1210ad540459SPierre Jolivet   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1211076ba34aSJunchao Zhang 
1212076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1213076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1214ff751488SJunchao Zhang   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
12159566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
12169566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1217076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1218076ba34aSJunchao Zhang }
1219076ba34aSJunchao Zhang 
1220076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1221076ba34aSJunchao Zhang 
1222076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1223076ba34aSJunchao Zhang  */
12249371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) {
1225076ba34aSJunchao Zhang   PetscFunctionBegin;
12269566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
12279566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
12288c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
12298c3ff71bSJunchao Zhang }
12308c3ff71bSJunchao Zhang 
12318c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
1232152b3e56SJunchao Zhang /*@C
12338c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
12348c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
12358c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
12368c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
12378c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
12388c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
12398c3ff71bSJunchao Zhang 
12408c3ff71bSJunchao Zhang    Collective
12418c3ff71bSJunchao Zhang 
12428c3ff71bSJunchao Zhang    Input Parameters:
12438c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
12448c3ff71bSJunchao Zhang .  m - number of rows
12458c3ff71bSJunchao Zhang .  n - number of columns
12468c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
12478c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
12488c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
12498c3ff71bSJunchao Zhang 
12508c3ff71bSJunchao Zhang    Output Parameter:
12518c3ff71bSJunchao Zhang .  A - the matrix
12528c3ff71bSJunchao Zhang 
12538c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
12548c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
12558c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
12568c3ff71bSJunchao Zhang 
12578c3ff71bSJunchao Zhang    Notes:
12588c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
12598c3ff71bSJunchao Zhang 
12608c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
12618c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
12628c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
12638c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
12648c3ff71bSJunchao Zhang 
12658c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
12668c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
12678c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
12688c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
12698c3ff71bSJunchao Zhang 
12708c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
12718c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
12728c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
12738c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
12748c3ff71bSJunchao Zhang 
12758c3ff71bSJunchao Zhang    Level: intermediate
12768c3ff71bSJunchao Zhang 
1277db781477SPatrick Sanan .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
12788c3ff71bSJunchao Zhang @*/
12799371c9d4SSatish Balay PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) {
12808c3ff71bSJunchao Zhang   PetscFunctionBegin;
12819566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
12829566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
12839566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
12849566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
12859566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
12868c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
12878c3ff71bSJunchao Zhang }
1288930e68a5SMark Adams 
12898f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
12908f7e8f9dSMark Adams //
129146804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used).
12928f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
12938f7e8f9dSMark Adams //
12949371c9d4SSatish Balay static PetscErrorCode                     MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info) {
12958f7e8f9dSMark Adams                       Mat_SeqAIJ       *b      = (Mat_SeqAIJ *)B->data;
12968f7e8f9dSMark Adams                       Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
12978f7e8f9dSMark Adams                       IS                isrow = b->row, isicol = b->icol;
12988f7e8f9dSMark Adams                       const PetscInt   *r_h, *ic_h;
1299300d22a6SJunchao Zhang                       const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1300076ba34aSJunchao Zhang                       const PetscScalar *aa_d = aijkok->a_dual.view_device().data();
1301076ba34aSJunchao Zhang                       PetscScalar       *ba_d = baijkok->a_dual.view_device().data();
13028f7e8f9dSMark Adams                       PetscBool          row_identity, col_identity;
130346804e07SMark Adams                       PetscInt           nc, Nf = 1, nVec = 32; // should be a parameter, Nf is batch size - not used
1304930e68a5SMark Adams 
1305930e68a5SMark Adams                       PetscFunctionBegin;
13062c71b3e2SJacob Faibussowitsch                       PetscCheck(A->rmap->n == n, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT, A->rmap->n, n);
1307b94d7dedSBarry Smith                       PetscCall(MatIsStructurallySymmetric(A, &row_identity));
13082c71b3e2SJacob Faibussowitsch                       PetscCheck(row_identity, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "structurally symmetric matrices only supported");
13099566063dSJacob Faibussowitsch                       PetscCall(ISGetIndices(isrow, &r_h));
13109566063dSJacob Faibussowitsch                       PetscCall(ISGetIndices(isicol, &ic_h));
13119566063dSJacob Faibussowitsch                       PetscCall(ISGetSize(isicol, &nc));
13129566063dSJacob Faibussowitsch                       PetscCall(PetscLogGpuTimeBegin());
13139566063dSJacob Faibussowitsch                       PetscCall(MatSeqAIJKokkosSyncDevice(A));
13148f7e8f9dSMark Adams                       {
13158f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
13168f7e8f9dSMark Adams     using scr_mem_t    = Kokkos::DefaultExecutionSpace::scratch_memory_space;
13178f7e8f9dSMark Adams     using sizet_scr_t  = Kokkos::View<size_t, scr_mem_t>;
13188f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
13198f7e8f9dSMark Adams     const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n);
13208f7e8f9dSMark Adams     Kokkos::View<PetscInt *, Kokkos::LayoutLeft>                                                                         d_r_k("r", n);
13218f7e8f9dSMark Adams     const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc);
13228f7e8f9dSMark Adams     Kokkos::View<PetscInt *, Kokkos::LayoutLeft>                                                                         d_ic_k("ic", nc);
13238f7e8f9dSMark Adams     size_t                                                                                                               flops_h = 0.0;
13248f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace>                                                                              h_flops_k(&flops_h);
13258f7e8f9dSMark Adams     Kokkos::View<size_t>                                                                                                 d_flops_k("flops");
13268f7e8f9dSMark Adams     const int                                                                                                            conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
13278f7e8f9dSMark Adams     const int                                                                                                            nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
13288f7e8f9dSMark Adams     Kokkos::deep_copy(d_flops_k, h_flops_k);
13298f7e8f9dSMark Adams     Kokkos::deep_copy(d_r_k, h_r_k);
13308f7e8f9dSMark Adams     Kokkos::deep_copy(d_ic_k, h_ic_k);
13318f7e8f9dSMark Adams     // Fill A --> fact
13329371c9d4SSatish Balay     Kokkos::parallel_for(
13339371c9d4SSatish Balay       Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) {
1334042217e8SBarry Smith         const PetscInt  field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA
13358f7e8f9dSMark Adams         const PetscInt  nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i);
13368f7e8f9dSMark Adams         const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data();
13378f7e8f9dSMark Adams         // zero rows of B
13388f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
13398f7e8f9dSMark Adams           PetscInt     nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag
13408f7e8f9dSMark Adams           PetscScalar *baL = ba_d + bi_d[rowb];
13418f7e8f9dSMark Adams           PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1;
13428f7e8f9dSMark Adams           /* zero (unfactored row) */
13438f7e8f9dSMark Adams           for (int j = 0; j < nzbL; j++) baL[j] = 0;
13448f7e8f9dSMark Adams           for (int j = 0; j < nzbU; j++) baU[j] = 0;
13458f7e8f9dSMark Adams         });
13468f7e8f9dSMark Adams         // copy A into B
13478f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) {
13488f7e8f9dSMark Adams           PetscInt           rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa];
13498f7e8f9dSMark Adams           const PetscScalar *av    = aa_d + ai_d[rowa];
13508f7e8f9dSMark Adams           const PetscInt    *ajtmp = aj_d + ai_d[rowa];
13518f7e8f9dSMark Adams           /* load in initial (unfactored row) */
13528f7e8f9dSMark Adams           for (int j = 0; j < nza; j++) {
13538f7e8f9dSMark Adams             PetscInt    colb = ic[ajtmp[j]];
13548f7e8f9dSMark Adams             PetscScalar vala = av[j];
13558f7e8f9dSMark Adams             if (colb == rowb) {
13568f7e8f9dSMark Adams               *(ba_d + bdiag_d[rowb]) = vala;
13578f7e8f9dSMark Adams             } else {
13588f7e8f9dSMark Adams               const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
13598f7e8f9dSMark Adams               PetscScalar    *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]);
13608f7e8f9dSMark Adams               PetscInt        nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0;
13618f7e8f9dSMark Adams               for (int j = 0; j < nz; j++) {
13628f7e8f9dSMark Adams                 if (pbj[j] == colb) {
13638f7e8f9dSMark Adams                   pba[j] = vala;
13648f7e8f9dSMark Adams                   set++;
13658f7e8f9dSMark Adams                   break;
13668f7e8f9dSMark Adams                 }
13678f7e8f9dSMark Adams               }
13688f1da0b2SJunchao Zhang #if !defined(PETSC_HAVE_SYCL)
13698f7e8f9dSMark Adams               if (set != 1) printf("\t\t\t ERROR DID NOT SET ?????\n");
13708f1da0b2SJunchao Zhang #endif
13718f7e8f9dSMark Adams             }
13728f7e8f9dSMark Adams           }
13738f7e8f9dSMark Adams         });
13748f7e8f9dSMark Adams       });
13758f7e8f9dSMark Adams     Kokkos::fence();
1376930e68a5SMark Adams 
13779371c9d4SSatish Balay     Kokkos::parallel_for(
13789371c9d4SSatish Balay       Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) {
13798f7e8f9dSMark Adams         sizet_scr_t    colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
13808f7e8f9dSMark Adams         scalar_scr_t   L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
13818f7e8f9dSMark Adams         sizet_scr_t    flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1382042217e8SBarry Smith         const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA
13838f7e8f9dSMark Adams         const PetscInt start = field * nloc, end = start + nloc;
13848f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
13858f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
13868f7e8f9dSMark Adams         for (int ii = start; ii < end - 1; ii++) {
13878f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end)
13888f7e8f9dSMark Adams           const PetscScalar *baUi    = ba_d + bdiag_d[ii + 1] + 1;                                          // vector of data  U(i,i+1:end)
13898f7e8f9dSMark Adams           const PetscInt     nUi_its = nzUi / Ni + !!(nzUi % Ni);
13908f7e8f9dSMark Adams           const PetscScalar  Bii     = *(ba_d + bdiag_d[ii]); // diagonal in its special place
13918f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) {
13928f7e8f9dSMark Adams             PetscInt kIdx = j * Ni + field_block_idx;
13939371c9d4SSatish Balay             if (kIdx >= nzUi) /* void */
13949371c9d4SSatish Balay               ;
13958f7e8f9dSMark Adams             else {
13968f7e8f9dSMark Adams               const PetscInt  myk = bjUi[kIdx];                // assume symmetric structure, need a transposed meta-data here in general
13978f7e8f9dSMark Adams               const PetscInt *pjL = bj_d + bi_d[myk];          // look for L(myk,ii) in start of row
13988f7e8f9dSMark Adams               const PetscInt  nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:)
13998f7e8f9dSMark Adams               size_t          st_idx;
14008f7e8f9dSMark Adams               // find and do L(k,i) = A(:k,i) / A(i,i)
14018f7e8f9dSMark Adams               Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
14028f7e8f9dSMark Adams               // get column, there has got to be a better way
14039371c9d4SSatish Balay               Kokkos::parallel_reduce(
14049371c9d4SSatish Balay                 Kokkos::ThreadVectorRange(team, nzL),
14059371c9d4SSatish Balay                 [&](const int &j, size_t &idx) {
14068f7e8f9dSMark Adams                   if (pjL[j] == ii) {
14078f7e8f9dSMark Adams                     PetscScalar *pLki = ba_d + bi_d[myk] + j;
14088f7e8f9dSMark Adams                     idx               = j;           // output
14098f7e8f9dSMark Adams                     *pLki             = *pLki / Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
14108f7e8f9dSMark Adams                   }
14119371c9d4SSatish Balay                 },
14129371c9d4SSatish Balay                 st_idx);
14139371c9d4SSatish Balay               Kokkos::single(Kokkos::PerThread(team), [=]() {
14149371c9d4SSatish Balay                 colkIdx() = st_idx;
14159371c9d4SSatish Balay                 L_ki()    = *(ba_d + bi_d[myk] + st_idx);
14169371c9d4SSatish Balay               });
14178f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
141899551766SMark Adams               if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register
141999551766SMark Adams #endif
142099551766SMark Adams               // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
14218f7e8f9dSMark Adams               // U(i+1,:end)
14228f7e8f9dSMark Adams               Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U)
14238f7e8f9dSMark Adams                 PetscScalar Uij = baUi[uiIdx];
14248f7e8f9dSMark Adams                 PetscInt    col = bjUi[uiIdx];
14258f7e8f9dSMark Adams                 if (col == myk) {
14268f7e8f9dSMark Adams                   // A_kk = A_kk - L_ki * U_ij(k)
14278f7e8f9dSMark Adams                   PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
14288f7e8f9dSMark Adams                   *Akkv             = *Akkv - L_ki() * Uij;  // UiK
14298f7e8f9dSMark Adams                 } else {
14308f7e8f9dSMark Adams                   PetscScalar    *start, *end, *pAkjv = NULL;
14318f7e8f9dSMark Adams                   PetscInt        high, low;
14328f7e8f9dSMark Adams                   const PetscInt *startj;
14338f7e8f9dSMark Adams                   if (col < myk) { // L
14348f7e8f9dSMark Adams                     PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
14358f7e8f9dSMark Adams                     PetscInt     idx  = (pLki + 1) - (ba_d + bi_d[myk]); // index into row
14368f7e8f9dSMark Adams                     start             = pLki + 1;                        // start at pLki+1, A22(myk,1)
14378f7e8f9dSMark Adams                     startj            = bj_d + bi_d[myk] + idx;
14388f7e8f9dSMark Adams                     end               = ba_d + bi_d[myk + 1];
14398f7e8f9dSMark Adams                   } else {
14408f7e8f9dSMark Adams                     PetscInt idx = bdiag_d[myk + 1] + 1;
14418f7e8f9dSMark Adams                     start        = ba_d + idx;
14428f7e8f9dSMark Adams                     startj       = bj_d + idx;
14438f7e8f9dSMark Adams                     end          = ba_d + bdiag_d[myk];
14448f7e8f9dSMark Adams                   }
14458f7e8f9dSMark Adams                   // search for 'col', use bisection search - TODO
14468f7e8f9dSMark Adams                   low  = 0;
14478f7e8f9dSMark Adams                   high = (PetscInt)(end - start);
14488f7e8f9dSMark Adams                   while (high - low > 5) {
14498f7e8f9dSMark Adams                     int t = (low + high) / 2;
14508f7e8f9dSMark Adams                     if (startj[t] > col) high = t;
14518f7e8f9dSMark Adams                     else low = t;
14528f7e8f9dSMark Adams                   }
14538f7e8f9dSMark Adams                   for (pAkjv = start + low; pAkjv < start + high; pAkjv++) {
14548f7e8f9dSMark Adams                     if (startj[pAkjv - start] == col) break;
14558f7e8f9dSMark Adams                   }
14568f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
145799551766SMark Adams                   if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register
145899551766SMark Adams #endif
14598f7e8f9dSMark Adams                   *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
14608f7e8f9dSMark Adams                 }
14618f7e8f9dSMark Adams               });
14628f7e8f9dSMark Adams             }
14638f7e8f9dSMark Adams           });
14648f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
14658f7e8f9dSMark Adams           if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); });
14668f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
14679371c9d4SSatish Balay         Kokkos::single(Kokkos::PerTeam(team), [=]() {
14689371c9d4SSatish Balay           Kokkos::atomic_add(&d_flops_k(), flops());
14699371c9d4SSatish Balay           flops() = 0;
14709371c9d4SSatish Balay         });
14718f7e8f9dSMark Adams       });
14728f7e8f9dSMark Adams     Kokkos::fence();
14738f7e8f9dSMark Adams     Kokkos::deep_copy(h_flops_k, d_flops_k);
14749566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
14759371c9d4SSatish Balay     Kokkos::parallel_for(
14769371c9d4SSatish Balay       Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) {
14778f7e8f9dSMark Adams         const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni;                            //, field_offset = lg_rank%Ni;
14788f7e8f9dSMark Adams         const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters
14798f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
14808f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) {
14818f7e8f9dSMark Adams           int i = start + outer_index * Ni + lg_rank % Ni;
14828f7e8f9dSMark Adams           if (i < end) {
14838f7e8f9dSMark Adams             PetscScalar *pv = ba_d + bdiag_d[i];
14848f7e8f9dSMark Adams             *pv             = 1.0 / (*pv);
14858f7e8f9dSMark Adams           }
14868f7e8f9dSMark Adams         });
14878f7e8f9dSMark Adams       });
14888f7e8f9dSMark Adams   }
14899566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
14909566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isicol, &ic_h));
14919566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &r_h));
14928f7e8f9dSMark Adams 
14939566063dSJacob Faibussowitsch   PetscCall(ISIdentity(isrow, &row_identity));
14949566063dSJacob Faibussowitsch   PetscCall(ISIdentity(isicol, &col_identity));
14958f7e8f9dSMark Adams   if (b->inode.size) {
14968f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
14978f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
14988f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
14998f7e8f9dSMark Adams   } else {
15008f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
15018f7e8f9dSMark Adams   }
15028f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
15039566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncHost(B));          // solve on CPU
15048f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
15058f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
15068f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
15078f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
15088f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
15098f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
15108f7e8f9dSMark Adams 
1511930e68a5SMark Adams   PetscFunctionReturn(0);
1512930e68a5SMark Adams }
1513930e68a5SMark Adams 
15149371c9d4SSatish Balay static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
1515930e68a5SMark Adams   PetscFunctionBegin;
15169566063dSJacob Faibussowitsch   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
151786a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
151886a27549SJunchao Zhang   PetscFunctionReturn(0);
151986a27549SJunchao Zhang }
152086a27549SJunchao Zhang 
15219371c9d4SSatish Balay static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) {
152286a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
152386a27549SJunchao Zhang 
152486a27549SJunchao Zhang   PetscFunctionBegin;
152586a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
152686a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d);
152786a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d);
152886a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
152986a27549SJunchao Zhang   }
153086a27549SJunchao Zhang   PetscFunctionReturn(0);
153186a27549SJunchao Zhang }
153286a27549SJunchao Zhang 
153386a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
15349371c9d4SSatish Balay static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) {
153586a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1536076ba34aSJunchao Zhang   MatColIdxType               n       = A->rmap->n;
153786a27549SJunchao Zhang 
153886a27549SJunchao Zhang   PetscFunctionBegin;
153986a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
154086a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
1541076ba34aSJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1);
154286a27549SJunchao Zhang     Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */
1543076ba34aSJunchao Zhang     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0));
1544076ba34aSJunchao Zhang     factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0));
154586a27549SJunchao Zhang 
15469371c9d4SSatish Balay     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d,
154786a27549SJunchao Zhang                                                                                                                                                                                                               factors->iLt_d, factors->jLt_d, factors->aLt_d);
154886a27549SJunchao Zhang 
154986a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
155086a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
155186a27549SJunchao Zhang     */
15529371c9d4SSatish Balay     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d);
155386a27549SJunchao Zhang 
155486a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d);
155586a27549SJunchao Zhang 
155686a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
1557076ba34aSJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1);
155886a27549SJunchao Zhang     Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */
1559076ba34aSJunchao Zhang     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0));
1560076ba34aSJunchao Zhang     factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0));
156186a27549SJunchao Zhang 
15629371c9d4SSatish Balay     transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d,
156386a27549SJunchao Zhang                                                                                                                                                                                                               factors->iUt_d, factors->jUt_d, factors->aUt_d);
156486a27549SJunchao Zhang 
156586a27549SJunchao Zhang     /* Sort indices. See comments above */
15669371c9d4SSatish Balay     sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d);
156786a27549SJunchao Zhang 
156886a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d);
156986a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
157086a27549SJunchao Zhang   }
157186a27549SJunchao Zhang   PetscFunctionReturn(0);
157286a27549SJunchao Zhang }
157386a27549SJunchao Zhang 
157486a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
15759371c9d4SSatish Balay static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) {
157686a27549SJunchao Zhang   ConstPetscScalarKokkosView  bv;
157786a27549SJunchao Zhang   PetscScalarKokkosView       xv;
157886a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
157986a27549SJunchao Zhang 
158086a27549SJunchao Zhang   PetscFunctionBegin;
15819566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
15829566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
15839566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(b, &bv));
15849566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(x, &xv));
158586a27549SJunchao Zhang   /* Solve L tmpv = b */
15869566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector));
158786a27549SJunchao Zhang   /* Solve Ux = tmpv */
15889566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv));
15899566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(b, &bv));
15909566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
15919566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
159286a27549SJunchao Zhang   PetscFunctionReturn(0);
159386a27549SJunchao Zhang }
159486a27549SJunchao Zhang 
1595076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
15969371c9d4SSatish Balay static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) {
159786a27549SJunchao Zhang   ConstPetscScalarKokkosView  bv;
159886a27549SJunchao Zhang   PetscScalarKokkosView       xv;
159986a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
160086a27549SJunchao Zhang 
160186a27549SJunchao Zhang   PetscFunctionBegin;
16029566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
16039566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
16049566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(b, &bv));
16059566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(x, &xv));
160686a27549SJunchao Zhang   /* Solve U^T tmpv = b */
160786a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector);
160886a27549SJunchao Zhang 
160986a27549SJunchao Zhang   /* Solve L^T x = tmpv */
161086a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv);
16119566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(b, &bv));
16129566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
16139566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
161486a27549SJunchao Zhang   PetscFunctionReturn(0);
161586a27549SJunchao Zhang }
161686a27549SJunchao Zhang 
16179371c9d4SSatish Balay static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) {
161886a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
161986a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
162086a27549SJunchao Zhang   PetscInt                    fill_lev = info->levels;
162186a27549SJunchao Zhang 
162286a27549SJunchao Zhang   PetscFunctionBegin;
16239566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
16249566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1625076ba34aSJunchao Zhang 
1626076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1627076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1628076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1629076ba34aSJunchao Zhang 
1630076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d);
163186a27549SJunchao Zhang 
163286a27549SJunchao Zhang   B->assembled              = PETSC_TRUE;
163386a27549SJunchao Zhang   B->preallocated           = PETSC_TRUE;
163486a27549SJunchao Zhang   B->ops->solve             = MatSolve_SeqAIJKokkos;
163586a27549SJunchao Zhang   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
163686a27549SJunchao Zhang   B->ops->matsolve          = NULL;
163786a27549SJunchao Zhang   B->ops->matsolvetranspose = NULL;
163886a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
163986a27549SJunchao Zhang 
164086a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
164186a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
164286a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1643eeadb341SJunchao Zhang   /* TODO: log flops, but how to know that? */
16449566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
164586a27549SJunchao Zhang   PetscFunctionReturn(0);
164686a27549SJunchao Zhang }
164786a27549SJunchao Zhang 
16489371c9d4SSatish Balay static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
164986a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
165086a27549SJunchao Zhang   Mat_SeqAIJ                 *b;
165186a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
165286a27549SJunchao Zhang   PetscInt                    fill_lev = info->levels;
165386a27549SJunchao Zhang   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
165486a27549SJunchao Zhang   PetscInt                    n        = A->rmap->n;
165586a27549SJunchao Zhang 
165686a27549SJunchao Zhang   PetscFunctionBegin;
16579566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
165886a27549SJunchao Zhang   /* Rebuild factors */
16599371c9d4SSatish Balay   if (factors) {
16609371c9d4SSatish Balay     factors->Destroy();
16619371c9d4SSatish Balay   } /* Destroy the old if it exists */
16629371c9d4SSatish Balay   else {
16639371c9d4SSatish Balay     B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);
16649371c9d4SSatish Balay   }
166586a27549SJunchao Zhang 
166686a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
166786a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
166886a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
166986a27549SJunchao Zhang 
167086a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
167186a27549SJunchao Zhang 
167286a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
167386a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
167486a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d, n + 1);
167586a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
167686a27549SJunchao Zhang 
167786a27549SJunchao Zhang   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
1678076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1679076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1680076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d);
168186a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
168286a27549SJunchao Zhang 
168386a27549SJunchao Zhang   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
168486a27549SJunchao Zhang   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
168586a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
168686a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
168786a27549SJunchao Zhang 
168886a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
168986a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
169086a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
169186a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
169286a27549SJunchao Zhang #else
169386a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
169486a27549SJunchao Zhang #endif
169586a27549SJunchao Zhang 
169686a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
169786a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
169886a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
169986a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
170086a27549SJunchao Zhang 
170186a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
17029566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
170386a27549SJunchao Zhang   b     = (Mat_SeqAIJ *)B->data;
170486a27549SJunchao Zhang   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
170586a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
170686a27549SJunchao Zhang   B->info.fill_ratio_needed = ((PetscReal)b->nz) / ((PetscReal)nnzA);
170786a27549SJunchao Zhang 
170886a27549SJunchao Zhang   B->offloadmask          = PETSC_OFFLOAD_GPU;
170986a27549SJunchao Zhang   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos;
1710930e68a5SMark Adams   PetscFunctionReturn(0);
1711930e68a5SMark Adams }
1712930e68a5SMark Adams 
17139371c9d4SSatish Balay static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
17148f7e8f9dSMark Adams   Mat_SeqAIJ    *b     = (Mat_SeqAIJ *)B->data;
17158f7e8f9dSMark Adams   const PetscInt nrows = A->rmap->n;
1716930e68a5SMark Adams 
17178f7e8f9dSMark Adams   PetscFunctionBegin;
17189566063dSJacob Faibussowitsch   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
17198f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
17208f7e8f9dSMark Adams   // move B data into Kokkos
17219566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
17229566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
17238f7e8f9dSMark Adams   {
17248f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
1725300d22a6SJunchao Zhang     if (!baijkok->diag_d.extent(0)) {
17268f7e8f9dSMark Adams       const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1);
1727300d22a6SJunchao Zhang       baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag));
1728300d22a6SJunchao Zhang       Kokkos::deep_copy(baijkok->diag_d, h_diag);
17298f7e8f9dSMark Adams     }
17308f7e8f9dSMark Adams   }
17318f7e8f9dSMark Adams   PetscFunctionReturn(0);
17328f7e8f9dSMark Adams }
17338f7e8f9dSMark Adams 
17349371c9d4SSatish Balay static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) {
1735930e68a5SMark Adams   PetscFunctionBegin;
1736930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
1737930e68a5SMark Adams   PetscFunctionReturn(0);
1738930e68a5SMark Adams }
1739930e68a5SMark Adams 
17409371c9d4SSatish Balay static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type) {
17418f7e8f9dSMark Adams   PetscFunctionBegin;
17428f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
17438f7e8f9dSMark Adams   PetscFunctionReturn(0);
17448f7e8f9dSMark Adams }
17458f7e8f9dSMark Adams 
1746930e68a5SMark Adams /*MC
174786a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
174886a27549SJunchao Zhang   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1749930e68a5SMark Adams 
1750930e68a5SMark Adams   Level: beginner
1751930e68a5SMark Adams 
1752db781477SPatrick Sanan .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1753930e68a5SMark Adams M*/
175486a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1755930e68a5SMark Adams {
1756930e68a5SMark Adams   PetscInt n = A->rmap->n;
1757930e68a5SMark Adams 
1758930e68a5SMark Adams   PetscFunctionBegin;
17599566063dSJacob Faibussowitsch   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
17609566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*B, n, n, n, n));
1761930e68a5SMark Adams   (*B)->factortype = ftype;
17629566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
17639566063dSJacob Faibussowitsch   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
1764930e68a5SMark Adams 
17658f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
17669566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
176786a27549SJunchao Zhang     (*B)->canuseordering        = PETSC_TRUE;
176886a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
176986a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
17709566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
177186a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
177286a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
177398921bdaSJacob Faibussowitsch   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1774930e68a5SMark Adams 
17759566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
17769566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos));
1777930e68a5SMark Adams   PetscFunctionReturn(0);
1778930e68a5SMark Adams }
17798f7e8f9dSMark Adams 
17809371c9d4SSatish Balay PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B) {
17818f7e8f9dSMark Adams   PetscInt n = A->rmap->n;
17828f7e8f9dSMark Adams 
17838f7e8f9dSMark Adams   PetscFunctionBegin;
17849566063dSJacob Faibussowitsch   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
17859566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*B, n, n, n, n));
17868f7e8f9dSMark Adams   (*B)->factortype     = ftype;
1787f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
17889566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
17899566063dSJacob Faibussowitsch   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
17908f7e8f9dSMark Adams 
17918f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
17929566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
17938f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
17948f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types");
17958f7e8f9dSMark Adams 
17969566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
17979566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device));
17988f7e8f9dSMark Adams   PetscFunctionReturn(0);
17998f7e8f9dSMark Adams }
180086a27549SJunchao Zhang 
18019371c9d4SSatish Balay PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) {
180286a27549SJunchao Zhang   PetscFunctionBegin;
18039566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
18049566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
18059566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device));
180686a27549SJunchao Zhang   PetscFunctionReturn(0);
180786a27549SJunchao Zhang }
180886a27549SJunchao Zhang 
1809076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
18109371c9d4SSatish Balay PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) {
1811076ba34aSJunchao Zhang   const auto        &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map);
1812076ba34aSJunchao Zhang   const auto        &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries);
1813076ba34aSJunchao Zhang   const auto        &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values);
1814076ba34aSJunchao Zhang   const PetscInt    *i  = iv.data();
1815076ba34aSJunchao Zhang   const PetscInt    *j  = jv.data();
1816076ba34aSJunchao Zhang   const PetscScalar *a  = av.data();
1817076ba34aSJunchao Zhang   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
1818076ba34aSJunchao Zhang 
1819076ba34aSJunchao Zhang   PetscFunctionBegin;
18209566063dSJacob Faibussowitsch   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
1821076ba34aSJunchao Zhang   for (PetscInt k = 0; k < m; k++) {
18229566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
182348a46eb9SPierre Jolivet     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
18249566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
1825076ba34aSJunchao Zhang   }
1826076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1827076ba34aSJunchao Zhang }
1828