xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 5fbaff9626e476811526397c1aa2ba76a87741c8)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2076ba34aSJunchao Zhang #include <petscpkg_version.h>
3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
442550becSJunchao Zhang #include <petsc/private/sfimpl.h>
58c3ff71bSJunchao Zhang #include <petscsystypes.h>
68c3ff71bSJunchao Zhang #include <petscerror.h>
78c3ff71bSJunchao Zhang 
88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
9f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
10076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp>
118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
128c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
15076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
16076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
1786a27549SJunchao Zhang 
1842550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
198c3ff71bSJunchao Zhang 
208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
218c3ff71bSJunchao Zhang 
22076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
25076ba34aSJunchao Zhang  */
268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
278c3ff71bSJunchao Zhang {
28076ba34aSJunchao Zhang   Mat_SeqAIJ       *aijseq;
29076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
308c3ff71bSJunchao Zhang 
318c3ff71bSJunchao Zhang   PetscFunctionBegin;
32076ba34aSJunchao Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
339566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
34076ba34aSJunchao Zhang 
35076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
36076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
37076ba34aSJunchao Zhang 
38076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
39076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
40076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
41076ba34aSJunchao Zhang   */
42076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
43076ba34aSJunchao Zhang     delete aijkok;
44076ba34aSJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
45076ba34aSJunchao Zhang     A->spptr = aijkok;
46076ba34aSJunchao Zhang   }
47076ba34aSJunchao Zhang 
48a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
49a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
50a587d139SMark   }
518c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
528c3ff71bSJunchao Zhang }
538c3ff71bSJunchao Zhang 
5486a27549SJunchao Zhang /* Sync CSR data to device if not yet */
55076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
568c3ff71bSJunchao Zhang {
578c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
588c3ff71bSJunchao Zhang 
598c3ff71bSJunchao Zhang   PetscFunctionBegin;
605f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
615f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
62076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
63076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
64580c7c76SPierre Jolivet     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
6586a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
668c3ff71bSJunchao Zhang   }
678c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
688c3ff71bSJunchao Zhang }
698c3ff71bSJunchao Zhang 
70076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
71fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
7286a27549SJunchao Zhang {
7386a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
7486a27549SJunchao Zhang 
7586a27549SJunchao Zhang   PetscFunctionBegin;
765f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
7786a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
7886a27549SJunchao Zhang   aijkok->a_dual.modify_device();
7986a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
8086a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
819566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
829566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
8386a27549SJunchao Zhang   PetscFunctionReturn(0);
8486a27549SJunchao Zhang }
8586a27549SJunchao Zhang 
86f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
87f0cf5187SStefano Zampini {
88f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
89f0cf5187SStefano Zampini 
90f0cf5187SStefano Zampini   PetscFunctionBegin;
91f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9286a27549SJunchao Zhang   /* We do not expect one needs factors on host  */
935f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
945f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
95076ba34aSJunchao Zhang   aijkok->a_dual.sync_host();
96f0cf5187SStefano Zampini   PetscFunctionReturn(0);
97f0cf5187SStefano Zampini }
98f0cf5187SStefano Zampini 
99f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
100f0cf5187SStefano Zampini {
101076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
102f0cf5187SStefano Zampini 
103f0cf5187SStefano Zampini   PetscFunctionBegin;
104076ba34aSJunchao Zhang   if (aijkok) {
105076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
106076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
107076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
108076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
109076ba34aSJunchao Zhang   }
110076ba34aSJunchao Zhang   PetscFunctionReturn(0);
111076ba34aSJunchao Zhang }
112076ba34aSJunchao Zhang 
113076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
114076ba34aSJunchao Zhang {
115076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
116076ba34aSJunchao Zhang 
117076ba34aSJunchao Zhang   PetscFunctionBegin;
118076ba34aSJunchao Zhang   if (aijkok) aijkok->a_dual.modify_host();
119076ba34aSJunchao Zhang   PetscFunctionReturn(0);
120076ba34aSJunchao Zhang }
121076ba34aSJunchao Zhang 
122076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
123076ba34aSJunchao Zhang {
124076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
125076ba34aSJunchao Zhang 
126076ba34aSJunchao Zhang   PetscFunctionBegin;
1272328674fSJunchao Zhang   if (aijkok) {
128076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
129076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1302328674fSJunchao Zhang   } else {
1312328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1322328674fSJunchao Zhang   }
133076ba34aSJunchao Zhang   PetscFunctionReturn(0);
134076ba34aSJunchao Zhang }
135076ba34aSJunchao Zhang 
136076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
137076ba34aSJunchao Zhang {
138076ba34aSJunchao Zhang   PetscFunctionBegin;
139076ba34aSJunchao Zhang   *array = NULL;
140076ba34aSJunchao Zhang   PetscFunctionReturn(0);
141076ba34aSJunchao Zhang }
142076ba34aSJunchao Zhang 
143076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
144076ba34aSJunchao Zhang {
145076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
146076ba34aSJunchao Zhang 
147076ba34aSJunchao Zhang   PetscFunctionBegin;
1482328674fSJunchao Zhang   if (aijkok) {
149076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1502328674fSJunchao Zhang   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
1512328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1522328674fSJunchao Zhang   }
153076ba34aSJunchao Zhang   PetscFunctionReturn(0);
154076ba34aSJunchao Zhang }
155076ba34aSJunchao Zhang 
156076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
157076ba34aSJunchao Zhang {
158076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
159076ba34aSJunchao Zhang 
160076ba34aSJunchao Zhang   PetscFunctionBegin;
1612328674fSJunchao Zhang   if (aijkok) {
162076ba34aSJunchao Zhang     aijkok->a_dual.clear_sync_state();
163076ba34aSJunchao Zhang     aijkok->a_dual.modify_host();
1642328674fSJunchao Zhang   }
165f0cf5187SStefano Zampini   PetscFunctionReturn(0);
166f0cf5187SStefano Zampini }
167f0cf5187SStefano Zampini 
1687ee59b9bSJunchao Zhang static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
1697ee59b9bSJunchao Zhang {
1707ee59b9bSJunchao Zhang   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1717ee59b9bSJunchao Zhang 
1727ee59b9bSJunchao Zhang   PetscFunctionBegin;
1737ee59b9bSJunchao Zhang   PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL");
1747ee59b9bSJunchao Zhang 
1757ee59b9bSJunchao Zhang   if (i) *i = aijkok->i_device_data();
1767ee59b9bSJunchao Zhang   if (j) *j = aijkok->j_device_data();
1777ee59b9bSJunchao Zhang   if (a) {
1787ee59b9bSJunchao Zhang     aijkok->a_dual.sync_device();
1797ee59b9bSJunchao Zhang     *a = aijkok->a_device_data();
1807ee59b9bSJunchao Zhang   }
1817ee59b9bSJunchao Zhang   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
1827ee59b9bSJunchao Zhang   PetscFunctionReturn(0);
1837ee59b9bSJunchao Zhang }
1847ee59b9bSJunchao Zhang 
185a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
186042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
187a587d139SMark {
188042217e8SBarry Smith   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
189042217e8SBarry Smith   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
190a587d139SMark 
191a587d139SMark   PetscFunctionBegin;
1925f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
193152b3e56SJunchao Zhang   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
194a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
195a587d139SMark   PetscFunctionReturn(0);
196a587d139SMark }
197a587d139SMark 
198a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
199042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
200a587d139SMark {
201042217e8SBarry Smith   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
202a587d139SMark 
203a587d139SMark   PetscFunctionBegin;
204a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
205a587d139SMark     *d_mat = aijkok->device_mat_d.data();
206a587d139SMark   } else {
2079566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
208a587d139SMark     *d_mat  = NULL;
209a587d139SMark   }
210a587d139SMark   PetscFunctionReturn(0);
211a587d139SMark }
212076ba34aSJunchao Zhang 
213076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */
214076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
215152b3e56SJunchao Zhang {
216152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
217152b3e56SJunchao Zhang 
218152b3e56SJunchao Zhang   PetscFunctionBegin;
2195f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
220076ba34aSJunchao Zhang   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
221076ba34aSJunchao Zhang     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
2229566063dSJacob Faibussowitsch     PetscCallCXX(aijkok->a_dual.sync_device());
2239566063dSJacob Faibussowitsch     PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
2249566063dSJacob Faibussowitsch     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
22586a27549SJunchao Zhang     aijkok->transpose_updated = PETSC_TRUE;
226076ba34aSJunchao Zhang   }
227076ba34aSJunchao Zhang   *csrmatT = &aijkok->csrmatT;
228152b3e56SJunchao Zhang   PetscFunctionReturn(0);
229152b3e56SJunchao Zhang }
230152b3e56SJunchao Zhang 
231076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */
232076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
233152b3e56SJunchao Zhang {
234152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
235152b3e56SJunchao Zhang 
236152b3e56SJunchao Zhang   PetscFunctionBegin;
2379566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2385f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
239076ba34aSJunchao Zhang   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
2409566063dSJacob Faibussowitsch     PetscCallCXX(aijkok->a_dual.sync_device());
2419566063dSJacob Faibussowitsch     PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
2429566063dSJacob Faibussowitsch     PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
243076ba34aSJunchao Zhang    #if defined(PETSC_USE_COMPLEX)
244076ba34aSJunchao Zhang     const auto& a = aijkok->csrmatH.values;
245076ba34aSJunchao Zhang     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
246076ba34aSJunchao Zhang    #endif
24786a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_TRUE;
248076ba34aSJunchao Zhang   }
249076ba34aSJunchao Zhang   *csrmatH = &aijkok->csrmatH;
2509566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
251152b3e56SJunchao Zhang   PetscFunctionReturn(0);
252152b3e56SJunchao Zhang }
253a587d139SMark 
2548c3ff71bSJunchao Zhang /* y = A x */
2558c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2568c3ff71bSJunchao Zhang {
2578c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
258152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
259152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2608c3ff71bSJunchao Zhang 
2618c3ff71bSJunchao Zhang   PetscFunctionBegin;
2629566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2639566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2649566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx,&xv));
2659566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy,&yv));
2668c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
267152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
2689566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx,&xv));
2699566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
270076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
2719566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
2729566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
2738c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2748c3ff71bSJunchao Zhang }
2758c3ff71bSJunchao Zhang 
2768c3ff71bSJunchao Zhang /* y = A^T x */
2778c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2788c3ff71bSJunchao Zhang {
2798c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
280152b3e56SJunchao Zhang   const char                       *mode;
281152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
282152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
283076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
2848c3ff71bSJunchao Zhang 
2858c3ff71bSJunchao Zhang   PetscFunctionBegin;
2869566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2879566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2889566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx,&xv));
2899566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy,&yv));
290152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
2919566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
292152b3e56SJunchao Zhang     mode = "N";
293152b3e56SJunchao Zhang   } else {
294076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
295076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
296152b3e56SJunchao Zhang     mode = "T";
297152b3e56SJunchao Zhang   }
298076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
2999566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx,&xv));
3009566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
3019566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
3029566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3038c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3048c3ff71bSJunchao Zhang }
3058c3ff71bSJunchao Zhang 
3068c3ff71bSJunchao Zhang /* y = A^H x */
3078c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
3088c3ff71bSJunchao Zhang {
3098c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
310152b3e56SJunchao Zhang   const char                       *mode;
311152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
312152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
313076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3148c3ff71bSJunchao Zhang 
3158c3ff71bSJunchao Zhang   PetscFunctionBegin;
3169566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3179566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3189566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx,&xv));
3199566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy,&yv));
320152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
3219566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
322152b3e56SJunchao Zhang     mode = "N";
323152b3e56SJunchao Zhang   } else {
324076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
325076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
326152b3e56SJunchao Zhang     mode = "C";
327152b3e56SJunchao Zhang   }
328076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
3299566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx,&xv));
3309566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
3319566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
3329566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3338c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3348c3ff71bSJunchao Zhang }
3358c3ff71bSJunchao Zhang 
3368c3ff71bSJunchao Zhang /* z = A x + y */
3378c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
3388c3ff71bSJunchao Zhang {
3398c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
340152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
341152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
3428c3ff71bSJunchao Zhang 
3438c3ff71bSJunchao Zhang   PetscFunctionBegin;
3449566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3459566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3469566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx,&xv));
3479566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(yy,&yv));
3489566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(zz,&zv));
3498c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
3508c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
351152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
3529566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx,&xv));
3539566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(yy,&yv));
3549566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
3559566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
3569566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3578c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3588c3ff71bSJunchao Zhang }
3598c3ff71bSJunchao Zhang 
3608c3ff71bSJunchao Zhang /* z = A^T x + y */
3618c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3628c3ff71bSJunchao Zhang {
3638c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
364152b3e56SJunchao Zhang   const char                       *mode;
365152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
366152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
367076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3688c3ff71bSJunchao Zhang 
3698c3ff71bSJunchao Zhang   PetscFunctionBegin;
3709566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3719566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3729566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx,&xv));
3739566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(yy,&yv));
3749566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(zz,&zv));
3758c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
376152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
3779566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
378152b3e56SJunchao Zhang     mode = "N";
379152b3e56SJunchao Zhang   } else {
380076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
381076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
382152b3e56SJunchao Zhang     mode = "T";
383152b3e56SJunchao Zhang   }
384076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
3859566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx,&xv));
3869566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(yy,&yv));
3879566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
3889566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
3899566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3908c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3918c3ff71bSJunchao Zhang }
3928c3ff71bSJunchao Zhang 
3938c3ff71bSJunchao Zhang /* z = A^H x + y */
3948c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3958c3ff71bSJunchao Zhang {
3968c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
397152b3e56SJunchao Zhang   const char                       *mode;
398152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
399152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
400076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
4018c3ff71bSJunchao Zhang 
4028c3ff71bSJunchao Zhang   PetscFunctionBegin;
4039566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4049566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
4059566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx,&xv));
4069566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(yy,&yv));
4079566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(zz,&zv));
4088c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
409152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4109566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
411152b3e56SJunchao Zhang     mode = "N";
412152b3e56SJunchao Zhang   } else {
413076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
414076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
415152b3e56SJunchao Zhang     mode = "C";
416152b3e56SJunchao Zhang   }
417076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
4189566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx,&xv));
4199566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(yy,&yv));
4209566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
4219566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
4229566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
423152b3e56SJunchao Zhang   PetscFunctionReturn(0);
424152b3e56SJunchao Zhang }
425152b3e56SJunchao Zhang 
426152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
427152b3e56SJunchao Zhang {
428152b3e56SJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
429152b3e56SJunchao Zhang 
430152b3e56SJunchao Zhang   PetscFunctionBegin;
431152b3e56SJunchao Zhang   switch (op) {
432152b3e56SJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
433152b3e56SJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
4349566063dSJacob Faibussowitsch       if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
435152b3e56SJunchao Zhang       A->form_explicit_transpose = flg;
436152b3e56SJunchao Zhang       break;
437152b3e56SJunchao Zhang     default:
4389566063dSJacob Faibussowitsch       PetscCall(MatSetOption_SeqAIJ(A,op,flg));
439152b3e56SJunchao Zhang       break;
440152b3e56SJunchao Zhang   }
4418c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4428c3ff71bSJunchao Zhang }
4438c3ff71bSJunchao Zhang 
444076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
4453d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
4468c3ff71bSJunchao Zhang {
447076ba34aSJunchao Zhang   Mat_SeqAIJ       *aseq;
4488c3ff71bSJunchao Zhang 
4498c3ff71bSJunchao Zhang   PetscFunctionBegin;
4509566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
451076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
4529566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */
4538c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
4549566063dSJacob Faibussowitsch     PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
455076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
4565f80ce2aSJacob Faibussowitsch     PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
4579566063dSJacob Faibussowitsch     PetscCall(PetscFree(A->defaultvectype));
4589566063dSJacob Faibussowitsch     PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */
4599566063dSJacob Faibussowitsch     PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS));
4609566063dSJacob Faibussowitsch     PetscCall(MatSetOps_SeqAIJKokkos(A));
461076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ*>(A->data);
462394ed5ebSJunchao Zhang     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
4635f80ce2aSJacob Faibussowitsch       PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
464076ba34aSJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
4658c3ff71bSJunchao Zhang     }
466076ba34aSJunchao Zhang   }
4678c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4688c3ff71bSJunchao Zhang }
4698c3ff71bSJunchao Zhang 
470076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
471076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
472076ba34aSJunchao Zhang  */
473076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
4748c3ff71bSJunchao Zhang {
475076ba34aSJunchao Zhang   Mat_SeqAIJ            *bseq;
476076ba34aSJunchao Zhang   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
477076ba34aSJunchao Zhang   Mat                   mat;
4788c3ff71bSJunchao Zhang 
4798c3ff71bSJunchao Zhang   PetscFunctionBegin;
480076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
4819566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B));
482076ba34aSJunchao Zhang   mat  = *B;
483076ba34aSJunchao Zhang   if (A->assembled) {
484076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
485076ba34aSJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
486076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
487076ba34aSJunchao Zhang     /* Now copy values to B if needed */
488076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
489076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
490076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
491076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
492076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
493076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
494076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
495076ba34aSJunchao Zhang       }
496076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
497076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
498076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
499076ba34aSJunchao Zhang     }
500076ba34aSJunchao Zhang     mat->spptr = bkok;
501076ba34aSJunchao Zhang   }
502076ba34aSJunchao Zhang 
5039566063dSJacob Faibussowitsch   PetscCall(PetscFree(mat->defaultvectype));
5049566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */
5059566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS));
5069566063dSJacob Faibussowitsch   PetscCall(MatSetOps_SeqAIJKokkos(mat));
5078c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5088c3ff71bSJunchao Zhang }
5098c3ff71bSJunchao Zhang 
5100ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
5110ecb592aSJunchao Zhang {
5120ecb592aSJunchao Zhang   Mat               At;
513ff751488SJunchao Zhang   KokkosCsrMatrix   *internT;
5140ecb592aSJunchao Zhang   Mat_SeqAIJKokkos  *atkok,*bkok;
5150ecb592aSJunchao Zhang 
5160ecb592aSJunchao Zhang   PetscFunctionBegin;
5179566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */
5180ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
519ff751488SJunchao Zhang     /* Deep copy internT, as we want to isolate the internal transpose */
5209566063dSJacob Faibussowitsch     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
5219566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At));
5220ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
5239566063dSJacob Faibussowitsch     else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */
5240ecb592aSJunchao Zhang   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
5250ecb592aSJunchao Zhang     if ((*B)->assembled) {
5260ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
5279566063dSJacob Faibussowitsch       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
5289566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
5290ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
5300ecb592aSJunchao Zhang       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
5310ecb592aSJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
5320ecb592aSJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
5339566063dSJacob Faibussowitsch       PetscCallCXX(Kokkos::deep_copy(a_h,internT->values));
5349566063dSJacob Faibussowitsch       PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
5350ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
5360ecb592aSJunchao Zhang   }
5370ecb592aSJunchao Zhang   PetscFunctionReturn(0);
5380ecb592aSJunchao Zhang }
5390ecb592aSJunchao Zhang 
5408c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
5418c3ff71bSJunchao Zhang {
54286a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
5438c3ff71bSJunchao Zhang 
5448c3ff71bSJunchao Zhang   PetscFunctionBegin;
54586a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
54686a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
5478c3ff71bSJunchao Zhang     delete aijkok;
54886a27549SJunchao Zhang   } else {
54986a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
55086a27549SJunchao Zhang   }
551cbc6b225SStefano Zampini   A->spptr = NULL;
5529566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
5539566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
5549566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
5559566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
5568c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5578c3ff71bSJunchao Zhang }
5588c3ff71bSJunchao Zhang 
5593f3ba80aSJunchao Zhang /*MC
5603f3ba80aSJunchao Zhang    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
5613f3ba80aSJunchao Zhang 
5623f3ba80aSJunchao Zhang    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
5633f3ba80aSJunchao Zhang 
5643f3ba80aSJunchao Zhang    Options Database Keys:
5653f3ba80aSJunchao Zhang .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
5663f3ba80aSJunchao Zhang 
5673f3ba80aSJunchao Zhang   Level: beginner
5683f3ba80aSJunchao Zhang 
5693f3ba80aSJunchao Zhang .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS
5703f3ba80aSJunchao Zhang M*/
57186a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
57286a27549SJunchao Zhang {
57386a27549SJunchao Zhang   PetscFunctionBegin;
5749566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
5759566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(A));
5769566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A));
57786a27549SJunchao Zhang   PetscFunctionReturn(0);
57886a27549SJunchao Zhang }
57986a27549SJunchao Zhang 
580076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
581076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
582a3f881fbSStefano Zampini {
583076ba34aSJunchao Zhang   Mat_SeqAIJ                   *a,*b;
584076ba34aSJunchao Zhang   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
585076ba34aSJunchao Zhang   MatScalarKokkosView          aa,ba,ca;
586076ba34aSJunchao Zhang   MatRowMapKokkosView          ai,bi,ci;
587076ba34aSJunchao Zhang   MatColIdxKokkosView          aj,bj,cj;
588076ba34aSJunchao Zhang   PetscInt                     m,n,nnz,aN;
589a3f881fbSStefano Zampini 
590a3f881fbSStefano Zampini   PetscFunctionBegin;
591076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
592076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
593076ba34aSJunchao Zhang   PetscValidPointer(C,4);
594076ba34aSJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
595076ba34aSJunchao Zhang   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
5965f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
5975f80ce2aSJacob Faibussowitsch   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
598076ba34aSJunchao Zhang 
5999566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
6009566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
601076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ*>(A->data);
602076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ*>(B->data);
603076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
604076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
605076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
606076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
607076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
608076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
609076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
610076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
611076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
612076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
613076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
614076ba34aSJunchao Zhang     aj = akok->j_dual.view_device();
615076ba34aSJunchao Zhang     bj = bkok->j_dual.view_device();
616076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
617076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
618076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
619076ba34aSJunchao Zhang     ca = ca_dual.view_device();
620076ba34aSJunchao Zhang     ci = ci_dual.view_device();
621076ba34aSJunchao Zhang     cj = cj_dual.view_device();
622076ba34aSJunchao Zhang 
623076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
624076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
625076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
626076ba34aSJunchao Zhang       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
627076ba34aSJunchao Zhang 
628076ba34aSJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
629076ba34aSJunchao Zhang         ci(i) = coffset;
630076ba34aSJunchao Zhang         if (i == m-1) ci(m) = ai(m) + bi(m);
631076ba34aSJunchao Zhang       });
632076ba34aSJunchao Zhang 
633076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
634076ba34aSJunchao Zhang         if (k < alen) {
635076ba34aSJunchao Zhang           ca(coffset+k) = aa(ai(i)+k);
636076ba34aSJunchao Zhang           cj(coffset+k) = aj(ai(i)+k);
637076ba34aSJunchao Zhang         } else {
638076ba34aSJunchao Zhang           ca(coffset+k) = ba(bi(i)+k-alen);
639076ba34aSJunchao Zhang           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
640076ba34aSJunchao Zhang         }
641076ba34aSJunchao Zhang       });
642076ba34aSJunchao Zhang     });
643076ba34aSJunchao Zhang     ca_dual.modify_device();
644076ba34aSJunchao Zhang     ci_dual.modify_device();
645076ba34aSJunchao Zhang     cj_dual.modify_device();
6469566063dSJacob Faibussowitsch     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
6479566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C));
648076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
649076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
650076ba34aSJunchao Zhang     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
651076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
652076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
653076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
654076ba34aSJunchao Zhang 
655076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
656076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
657076ba34aSJunchao Zhang       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
658076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
659076ba34aSJunchao Zhang         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
660076ba34aSJunchao Zhang         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
661076ba34aSJunchao Zhang       });
662076ba34aSJunchao Zhang     });
6639566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
664076ba34aSJunchao Zhang   }
665076ba34aSJunchao Zhang   PetscFunctionReturn(0);
666076ba34aSJunchao Zhang }
667076ba34aSJunchao Zhang 
668076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
669076ba34aSJunchao Zhang {
670076ba34aSJunchao Zhang   PetscFunctionBegin;
671076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
672a3f881fbSStefano Zampini   PetscFunctionReturn(0);
673a3f881fbSStefano Zampini }
674a3f881fbSStefano Zampini 
675a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
676a3f881fbSStefano Zampini {
677a3f881fbSStefano Zampini   Mat_Product                    *product = C->product;
678a3f881fbSStefano Zampini   Mat                            A,B;
679076ba34aSJunchao Zhang   bool                           transA,transB; /* use bool, since KK needs this type */
680a3f881fbSStefano Zampini   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
681a3f881fbSStefano Zampini   Mat_SeqAIJ                     *c;
682076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
683076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB;
684a3f881fbSStefano Zampini 
685a3f881fbSStefano Zampini   PetscFunctionBegin;
686a3f881fbSStefano Zampini   MatCheckProduct(C,1);
6875f80ce2aSJacob Faibussowitsch   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
688076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
689076ba34aSJunchao Zhang 
690076ba34aSJunchao Zhang   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
691076ba34aSJunchao Zhang     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
692076ba34aSJunchao Zhang     PetscFunctionReturn(0);
693076ba34aSJunchao Zhang   }
694076ba34aSJunchao Zhang 
695076ba34aSJunchao Zhang   switch (product->type) {
696076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
697076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
698076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
699076ba34aSJunchao Zhang     default:
70098921bdaSJacob Faibussowitsch       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
701076ba34aSJunchao Zhang   }
702076ba34aSJunchao Zhang 
703a3f881fbSStefano Zampini   A     = product->A;
704a3f881fbSStefano Zampini   B     = product->B;
7059566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
7069566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
707a3f881fbSStefano Zampini   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
708a3f881fbSStefano Zampini   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
709a3f881fbSStefano Zampini   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
710076ba34aSJunchao Zhang 
7115f80ce2aSJacob Faibussowitsch   PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
712076ba34aSJunchao Zhang 
713076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
714076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
715076ba34aSJunchao Zhang 
716076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
717076ba34aSJunchao Zhang   if (transA) {
7189566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
719076ba34aSJunchao Zhang     transA = false;
720a3f881fbSStefano Zampini   }
721a3f881fbSStefano Zampini 
722076ba34aSJunchao Zhang   if (transB) {
7239566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
724076ba34aSJunchao Zhang     transB = false;
725076ba34aSJunchao Zhang   }
7269566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
7279566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
7289566063dSJacob Faibussowitsch   PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
7299566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
7309566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(C));
731a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
732a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
7339566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
7349566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
7359566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
736a3f881fbSStefano Zampini   c->reallocs         = 0;
737076ba34aSJunchao Zhang   C->info.mallocs     = 0;
738a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
739a3f881fbSStefano Zampini   C->assembled        = C->was_assembled = PETSC_TRUE;
740a3f881fbSStefano Zampini   C->num_ass++;
741a3f881fbSStefano Zampini   PetscFunctionReturn(0);
742a3f881fbSStefano Zampini }
743a3f881fbSStefano Zampini 
744a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
745a3f881fbSStefano Zampini {
746076ba34aSJunchao Zhang   Mat_Product                    *product = C->product;
747076ba34aSJunchao Zhang   MatProductType                 ptype;
748076ba34aSJunchao Zhang   Mat                            A,B;
749076ba34aSJunchao Zhang   bool                           transA,transB;
750076ba34aSJunchao Zhang   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
751076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
752076ba34aSJunchao Zhang   MPI_Comm                       comm;
753076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
754a3f881fbSStefano Zampini 
755a3f881fbSStefano Zampini   PetscFunctionBegin;
756a3f881fbSStefano Zampini   MatCheckProduct(C,1);
7579566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
7585f80ce2aSJacob Faibussowitsch   PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
759a3f881fbSStefano Zampini   A       = product->A;
760a3f881fbSStefano Zampini   B       = product->B;
7619566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
7629566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
763a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
764a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
765076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
766076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
767076ba34aSJunchao Zhang 
768a3f881fbSStefano Zampini   ptype   = product->type;
769a3f881fbSStefano Zampini   switch (ptype) {
770076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
771076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
772076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
773a3f881fbSStefano Zampini     default:
77498921bdaSJacob Faibussowitsch       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
775a3f881fbSStefano Zampini   }
776a3f881fbSStefano Zampini 
777076ba34aSJunchao Zhang   product->data = pdata = new MatProductData_SeqAIJKokkos();
778076ba34aSJunchao Zhang   pdata->kh.set_team_work_size(16);
779076ba34aSJunchao Zhang   pdata->kh.set_dynamic_scheduling(true);
780076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
781a3f881fbSStefano Zampini 
782076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
783076ba34aSJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
7846ffb9508SJunchao Zhang   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
7856ffb9508SJunchao Zhang      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
7866ffb9508SJunchao Zhang    */
787076ba34aSJunchao Zhang   pdata->kh.create_spgemm_handle(spgemm_alg);
788076ba34aSJunchao Zhang 
7899566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
790076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
791076ba34aSJunchao Zhang   if (transA) {
7929566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
793076ba34aSJunchao Zhang     transA = false;
794076ba34aSJunchao Zhang   }
795076ba34aSJunchao Zhang 
796076ba34aSJunchao Zhang   if (transB) {
7979566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
798076ba34aSJunchao Zhang     transB = false;
799076ba34aSJunchao Zhang   }
800076ba34aSJunchao Zhang 
8019566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
802076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
803076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
804076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
805076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
806076ba34aSJunchao Zhang   */
8079566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
8089566063dSJacob Faibussowitsch   PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC));
8099566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
810076ba34aSJunchao Zhang 
8119566063dSJacob Faibussowitsch   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
8129566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok));
813076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
814a3f881fbSStefano Zampini   PetscFunctionReturn(0);
815a3f881fbSStefano Zampini }
816a3f881fbSStefano Zampini 
817a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
818076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
819a3f881fbSStefano Zampini {
820076ba34aSJunchao Zhang   Mat_Product    *product = mat->product;
821a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
822a3f881fbSStefano Zampini 
823a3f881fbSStefano Zampini   PetscFunctionBegin;
824a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
8259566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok));
826a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
8279566063dSJacob Faibussowitsch     PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok));
828a3f881fbSStefano Zampini   }
829a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
830a3f881fbSStefano Zampini     switch (product->type) {
831a3f881fbSStefano Zampini     case MATPRODUCT_AB:
832a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
833a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
834a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
835a3f881fbSStefano Zampini       break;
836a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
837a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
838a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
839a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
840a3f881fbSStefano Zampini       break;
841a3f881fbSStefano Zampini     default:
842a3f881fbSStefano Zampini       break;
843a3f881fbSStefano Zampini     }
844a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
8459566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
846a3f881fbSStefano Zampini   }
847a3f881fbSStefano Zampini   PetscFunctionReturn(0);
848a3f881fbSStefano Zampini }
849a587d139SMark 
850f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
851f0cf5187SStefano Zampini {
852f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
853f0cf5187SStefano Zampini 
854f0cf5187SStefano Zampini   PetscFunctionBegin;
8559566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
8569566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
857f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
858076ba34aSJunchao Zhang   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
8599566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
8609566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
8619566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
862f0cf5187SStefano Zampini   PetscFunctionReturn(0);
863f0cf5187SStefano Zampini }
864f0cf5187SStefano Zampini 
865a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
866a587d139SMark {
867076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
868a587d139SMark 
869a587d139SMark   PetscFunctionBegin;
870076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
8712328674fSJunchao Zhang   if (aijkok) { /* Only zero the device if data is already there */
872076ba34aSJunchao Zhang     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
8739566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(A));
8742328674fSJunchao Zhang   } else { /* Might be preallocated but not assembled */
8759566063dSJacob Faibussowitsch     PetscCall(MatZeroEntries_SeqAIJ(A));
8762328674fSJunchao Zhang   }
877a587d139SMark   PetscFunctionReturn(0);
878a587d139SMark }
879a587d139SMark 
880db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
88142550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
882db78de30SJunchao Zhang {
883db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
884db78de30SJunchao Zhang 
885db78de30SJunchao Zhang   PetscFunctionBegin;
886db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
887db78de30SJunchao Zhang   PetscValidPointer(kv,2);
888db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
8899566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
890db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
891076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
892db78de30SJunchao Zhang   PetscFunctionReturn(0);
893db78de30SJunchao Zhang }
894db78de30SJunchao Zhang 
89542550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
896db78de30SJunchao Zhang {
897db78de30SJunchao Zhang   PetscFunctionBegin;
898db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
899db78de30SJunchao Zhang   PetscValidPointer(kv,2);
900db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
901db78de30SJunchao Zhang   PetscFunctionReturn(0);
902db78de30SJunchao Zhang }
903db78de30SJunchao Zhang 
90442550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
905db78de30SJunchao Zhang {
906db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
907db78de30SJunchao Zhang 
908db78de30SJunchao Zhang   PetscFunctionBegin;
909db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
910db78de30SJunchao Zhang   PetscValidPointer(kv,2);
911db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9129566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
913db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
914076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
915db78de30SJunchao Zhang   PetscFunctionReturn(0);
916db78de30SJunchao Zhang }
917db78de30SJunchao Zhang 
91842550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
919db78de30SJunchao Zhang {
920db78de30SJunchao Zhang   PetscFunctionBegin;
921db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
922db78de30SJunchao Zhang   PetscValidPointer(kv,2);
923db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9249566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
925db78de30SJunchao Zhang   PetscFunctionReturn(0);
926db78de30SJunchao Zhang }
927db78de30SJunchao Zhang 
92842550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
929db78de30SJunchao Zhang {
930db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
931db78de30SJunchao Zhang 
932db78de30SJunchao Zhang   PetscFunctionBegin;
933db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
934db78de30SJunchao Zhang   PetscValidPointer(kv,2);
935db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
936db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
937076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
938db78de30SJunchao Zhang   PetscFunctionReturn(0);
939db78de30SJunchao Zhang }
940db78de30SJunchao Zhang 
94142550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
942db78de30SJunchao Zhang {
943db78de30SJunchao Zhang   PetscFunctionBegin;
944db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
945db78de30SJunchao Zhang   PetscValidPointer(kv,2);
946db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9479566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
948db78de30SJunchao Zhang   PetscFunctionReturn(0);
949db78de30SJunchao Zhang }
950db78de30SJunchao Zhang 
951c17cf699SJunchao Zhang /* Computes Y += alpha X */
952c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
953a587d139SMark {
954a587d139SMark   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
955c17cf699SJunchao Zhang   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
956c17cf699SJunchao Zhang   ConstMatScalarKokkosView   Xa;
957c17cf699SJunchao Zhang   MatScalarKokkosView        Ya;
958a587d139SMark 
959a587d139SMark   PetscFunctionBegin;
960c17cf699SJunchao Zhang   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
961c17cf699SJunchao Zhang   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
9629566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
9639566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(X));
9649566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
965db78de30SJunchao Zhang 
966c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
967c17cf699SJunchao Zhang     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
968a587d139SMark     PetscBool e;
9699566063dSJacob Faibussowitsch     PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e));
970a587d139SMark     if (e) {
9719566063dSJacob Faibussowitsch       PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e));
972c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
973a587d139SMark     }
974a587d139SMark   }
975db78de30SJunchao Zhang 
976c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
977c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
978c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
979c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
980c17cf699SJunchao Zhang   */
981c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
982c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
983c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
984c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
985c17cf699SJunchao Zhang 
986c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
987c17cf699SJunchao Zhang     KokkosBlas::axpy(alpha,Xa,Ya);
9889566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
989c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
990c17cf699SJunchao Zhang     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
991c17cf699SJunchao Zhang     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
992c17cf699SJunchao Zhang 
993c17cf699SJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
994c17cf699SJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
995c17cf699SJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
996c17cf699SJunchao Zhang         PetscInt p,q = Yi(i);
997c17cf699SJunchao Zhang         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
998c17cf699SJunchao Zhang           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
999c17cf699SJunchao Zhang           if (Xj(p) == Yj(q)) { /* Found it */
1000c17cf699SJunchao Zhang             Ya(q) += alpha * Xa(p);
1001c17cf699SJunchao Zhang             q++;
1002a587d139SMark           } else {
1003c17cf699SJunchao Zhang             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1004c17cf699SJunchao Zhang                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1005c17cf699SJunchao Zhang             */
1006c17cf699SJunchao Zhang             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1007a587d139SMark           }
1008c17cf699SJunchao Zhang         }
1009c17cf699SJunchao Zhang       });
1010c17cf699SJunchao Zhang     });
10119566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1012c17cf699SJunchao Zhang   } else { /* different nonzero patterns */
1013c17cf699SJunchao Zhang     Mat             Z;
1014c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
1015c17cf699SJunchao Zhang     KernelHandle    kh;
1016c17cf699SJunchao Zhang     kh.create_spadd_handle(false);
1017c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1018c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1019c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
10209566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z));
10219566063dSJacob Faibussowitsch     PetscCall(MatHeaderReplace(Y,&Z));
1022c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1023c17cf699SJunchao Zhang   }
10249566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
10259566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */
1026a587d139SMark   PetscFunctionReturn(0);
1027a587d139SMark }
1028a587d139SMark 
1029394ed5ebSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
103042550becSJunchao Zhang {
103142550becSJunchao Zhang   Mat_SeqAIJKokkos *akok;
103242550becSJunchao Zhang   Mat_SeqAIJ       *aseq;
103342550becSJunchao Zhang 
103442550becSJunchao Zhang   PetscFunctionBegin;
10359566063dSJacob Faibussowitsch   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
1036394ed5ebSJunchao Zhang   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
103742550becSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1038cbc6b225SStefano Zampini   delete akok;
1039cbc6b225SStefano Zampini   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE);
10409566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1041394ed5ebSJunchao Zhang   akok->SetUpCOO(aseq);
104242550becSJunchao Zhang   PetscFunctionReturn(0);
104342550becSJunchao Zhang }
104442550becSJunchao Zhang 
104542550becSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
104642550becSJunchao Zhang {
104742550becSJunchao Zhang   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
104842550becSJunchao Zhang   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1049394ed5ebSJunchao Zhang   PetscCount                  Annz = aseq->nz;
1050394ed5ebSJunchao Zhang   const PetscCountKokkosView& jmap = akok->jmap_d;
1051394ed5ebSJunchao Zhang   const PetscCountKokkosView& perm = akok->perm_d;
105242550becSJunchao Zhang   MatScalarKokkosView         Aa;
105342550becSJunchao Zhang   ConstMatScalarKokkosView    kv;
105442550becSJunchao Zhang   PetscMemType                memtype;
105542550becSJunchao Zhang 
105642550becSJunchao Zhang   PetscFunctionBegin;
10579566063dSJacob Faibussowitsch   PetscCall(PetscGetMemType(v,&memtype));
105842550becSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1059394ed5ebSJunchao Zhang     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
106042550becSJunchao Zhang   } else {
1061394ed5ebSJunchao Zhang     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
106242550becSJunchao Zhang   }
106342550becSJunchao Zhang 
1064cbc6b225SStefano Zampini   if (imode == INSERT_VALUES) {
10659566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */
1066cbc6b225SStefano Zampini     Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
10679566063dSJacob Faibussowitsch   } else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */
106842550becSJunchao Zhang 
1069cbc6b225SStefano Zampini   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));});
1070394ed5ebSJunchao Zhang 
10719566063dSJacob Faibussowitsch   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
10729566063dSJacob Faibussowitsch   else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa));
107342550becSJunchao Zhang   PetscFunctionReturn(0);
107442550becSJunchao Zhang }
107542550becSJunchao Zhang 
1076*5fbaff96SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag)
1077*5fbaff96SJunchao Zhang {
1078*5fbaff96SJunchao Zhang   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1079*5fbaff96SJunchao Zhang   MatScalarKokkosView         Aa;
1080*5fbaff96SJunchao Zhang   const MatRowMapKokkosView&  Ai = akok->i_dual.view_device();
1081*5fbaff96SJunchao Zhang   PetscInt                    m = A->rmap->n;
1082*5fbaff96SJunchao Zhang   ConstMatRowMapKokkosView    Adiag(diag,m); /* diag is a device pointer */
1083*5fbaff96SJunchao Zhang 
1084*5fbaff96SJunchao Zhang   PetscFunctionBegin;
1085*5fbaff96SJunchao Zhang   PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa));
1086*5fbaff96SJunchao Zhang   Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) {
1087*5fbaff96SJunchao Zhang     PetscScalar tmp;
1088*5fbaff96SJunchao Zhang     if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */
1089*5fbaff96SJunchao Zhang       tmp          = Aa(Ai(i));
1090*5fbaff96SJunchao Zhang       Aa(Ai(i))    = Aa(Adiag(i));
1091*5fbaff96SJunchao Zhang       Aa(Adiag(i)) = tmp;
1092*5fbaff96SJunchao Zhang     }
1093*5fbaff96SJunchao Zhang   });
1094*5fbaff96SJunchao Zhang   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
1095*5fbaff96SJunchao Zhang   PetscFunctionReturn(0);
1096*5fbaff96SJunchao Zhang }
1097*5fbaff96SJunchao Zhang 
109886a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
10998f7e8f9dSMark Adams {
11008f7e8f9dSMark Adams   PetscFunctionBegin;
11019566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncHost(A));
11029566063dSJacob Faibussowitsch   PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
11038f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
11048f7e8f9dSMark Adams   PetscFunctionReturn(0);
11058f7e8f9dSMark Adams }
11068f7e8f9dSMark Adams 
11078c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
11088c3ff71bSJunchao Zhang {
1109076ba34aSJunchao Zhang   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1110076ba34aSJunchao Zhang 
11118c3ff71bSJunchao Zhang   PetscFunctionBegin;
1112076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
11136f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
11146f3d89d0SStefano Zampini 
11158c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
11168c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
11178c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1118a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1119f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1120a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1121076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
11228c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
11238c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
11248c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
11258c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
11268c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
11278c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1128076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
11290ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1130152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1131076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1132076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1133076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1134076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1135076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1136076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
11377ee59b9bSJunchao Zhang   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
113842550becSJunchao Zhang 
11399566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos));
11409566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos));
1141076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1142076ba34aSJunchao Zhang }
1143076ba34aSJunchao Zhang 
1144076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1145076ba34aSJunchao Zhang {
1146076ba34aSJunchao Zhang   Mat_SeqAIJ *aseq;
1147076ba34aSJunchao Zhang   PetscInt    i,m,n;
1148076ba34aSJunchao Zhang 
1149076ba34aSJunchao Zhang   PetscFunctionBegin;
11505f80ce2aSJacob Faibussowitsch   PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1151076ba34aSJunchao Zhang 
1152076ba34aSJunchao Zhang   m = akok->nrows();
1153076ba34aSJunchao Zhang   n = akok->ncols();
11549566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(A,m,n,m,n));
11559566063dSJacob Faibussowitsch   PetscCall(MatSetType(A,MATSEQAIJKOKKOS));
1156076ba34aSJunchao Zhang 
1157076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
11589566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL));
1159076ba34aSJunchao Zhang   aseq = (Mat_SeqAIJ*)(A)->data;
1160076ba34aSJunchao Zhang 
1161076ba34aSJunchao Zhang   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1162076ba34aSJunchao Zhang   akok->j_dual.sync_host();
1163076ba34aSJunchao Zhang 
1164076ba34aSJunchao Zhang   aseq->i            = akok->i_host_data();
1165076ba34aSJunchao Zhang   aseq->j            = akok->j_host_data();
1166076ba34aSJunchao Zhang   aseq->a            = akok->a_host_data();
1167076ba34aSJunchao Zhang   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1168076ba34aSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE;
1169076ba34aSJunchao Zhang   aseq->free_a       = PETSC_FALSE;
1170076ba34aSJunchao Zhang   aseq->free_ij      = PETSC_FALSE;
1171076ba34aSJunchao Zhang   aseq->nz           = akok->nnz();
1172076ba34aSJunchao Zhang   aseq->maxnz        = aseq->nz;
1173076ba34aSJunchao Zhang 
11749566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m,&aseq->imax));
11759566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m,&aseq->ilen));
1176076ba34aSJunchao Zhang   for (i=0; i<m; i++) {
1177076ba34aSJunchao Zhang     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1178076ba34aSJunchao Zhang   }
1179076ba34aSJunchao Zhang 
1180076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1181076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1182ff751488SJunchao Zhang   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
11839566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
11849566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
1185076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1186076ba34aSJunchao Zhang }
1187076ba34aSJunchao Zhang 
1188076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1189076ba34aSJunchao Zhang 
1190076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1191076ba34aSJunchao Zhang  */
1192076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1193076ba34aSJunchao Zhang {
1194076ba34aSJunchao Zhang   PetscFunctionBegin;
11959566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm,A));
11969566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok));
11978c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
11988c3ff71bSJunchao Zhang }
11998c3ff71bSJunchao Zhang 
12008c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
1201152b3e56SJunchao Zhang /*@C
12028c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
12038c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
12048c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
12058c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
12068c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
12078c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
12088c3ff71bSJunchao Zhang 
12098c3ff71bSJunchao Zhang    Collective
12108c3ff71bSJunchao Zhang 
12118c3ff71bSJunchao Zhang    Input Parameters:
12128c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
12138c3ff71bSJunchao Zhang .  m - number of rows
12148c3ff71bSJunchao Zhang .  n - number of columns
12158c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
12168c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
12178c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
12188c3ff71bSJunchao Zhang 
12198c3ff71bSJunchao Zhang    Output Parameter:
12208c3ff71bSJunchao Zhang .  A - the matrix
12218c3ff71bSJunchao Zhang 
12228c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
12238c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
12248c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
12258c3ff71bSJunchao Zhang 
12268c3ff71bSJunchao Zhang    Notes:
12278c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
12288c3ff71bSJunchao Zhang 
12298c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
12308c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
12318c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
12328c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
12338c3ff71bSJunchao Zhang 
12348c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
12358c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
12368c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
12378c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
12388c3ff71bSJunchao Zhang 
12398c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
12408c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
12418c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
12428c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
12438c3ff71bSJunchao Zhang 
12448c3ff71bSJunchao Zhang    Level: intermediate
12458c3ff71bSJunchao Zhang 
1246076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
12478c3ff71bSJunchao Zhang @*/
12488c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
12498c3ff71bSJunchao Zhang {
12508c3ff71bSJunchao Zhang   PetscFunctionBegin;
12519566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
12529566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm,A));
12539566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A,m,n,m,n));
12549566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A,MATSEQAIJKOKKOS));
12559566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
12568c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
12578c3ff71bSJunchao Zhang }
1258930e68a5SMark Adams 
12598f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
12608f7e8f9dSMark Adams //
126146804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used).
12628f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
12638f7e8f9dSMark Adams //
12648f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1265930e68a5SMark Adams {
12668f7e8f9dSMark Adams   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
12678f7e8f9dSMark Adams   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
12688f7e8f9dSMark Adams   IS                 isrow = b->row,isicol = b->icol;
12698f7e8f9dSMark Adams   const PetscInt     *r_h,*ic_h;
1270300d22a6SJunchao Zhang   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1271076ba34aSJunchao Zhang   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1272076ba34aSJunchao Zhang   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
12738f7e8f9dSMark Adams   PetscBool          row_identity,col_identity;
127446804e07SMark Adams   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1275930e68a5SMark Adams 
1276930e68a5SMark Adams   PetscFunctionBegin;
12772c71b3e2SJacob Faibussowitsch   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
12789566063dSJacob Faibussowitsch   PetscCall(MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity));
12792c71b3e2SJacob Faibussowitsch   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
12809566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&r_h));
12819566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isicol,&ic_h));
12829566063dSJacob Faibussowitsch   PetscCall(ISGetSize(isicol,&nc));
12839566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
12849566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
12858f7e8f9dSMark Adams   {
12868f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
12878f7e8f9dSMark Adams     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
12888f7e8f9dSMark Adams     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
12898f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
12908f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
12918f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
12928f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
12938f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
12948f7e8f9dSMark Adams     size_t flops_h = 0.0;
12958f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
12968f7e8f9dSMark Adams     Kokkos::View<size_t> d_flops_k ("flops");
12978f7e8f9dSMark Adams     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
12988f7e8f9dSMark Adams     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
12998f7e8f9dSMark Adams     Kokkos::deep_copy (d_flops_k, h_flops_k);
13008f7e8f9dSMark Adams     Kokkos::deep_copy (d_r_k, h_r_k);
13018f7e8f9dSMark Adams     Kokkos::deep_copy (d_ic_k, h_ic_k);
13028f7e8f9dSMark Adams     // Fill A --> fact
13038f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1304042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
13058f7e8f9dSMark Adams         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
13068f7e8f9dSMark Adams         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
13078f7e8f9dSMark Adams         // zero rows of B
13088f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
13098f7e8f9dSMark Adams             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
13108f7e8f9dSMark Adams             PetscScalar *baL = ba_d + bi_d[rowb];
13118f7e8f9dSMark Adams             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
13128f7e8f9dSMark Adams             /* zero (unfactored row) */
13138f7e8f9dSMark Adams             for (int j=0;j<nzbL;j++) baL[j] = 0;
13148f7e8f9dSMark Adams             for (int j=0;j<nzbU;j++) baU[j] = 0;
13158f7e8f9dSMark Adams           });
13168f7e8f9dSMark Adams         // copy A into B
13178f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
13188f7e8f9dSMark Adams             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
13198f7e8f9dSMark Adams             const PetscScalar *av    = aa_d + ai_d[rowa];
13208f7e8f9dSMark Adams             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
13218f7e8f9dSMark Adams             /* load in initial (unfactored row) */
13228f7e8f9dSMark Adams             for (int j=0;j<nza;j++) {
13238f7e8f9dSMark Adams               PetscInt    colb = ic[ajtmp[j]];
13248f7e8f9dSMark Adams               PetscScalar vala = av[j];
13258f7e8f9dSMark Adams               if (colb == rowb) {
13268f7e8f9dSMark Adams                 *(ba_d + bdiag_d[rowb]) = vala;
13278f7e8f9dSMark Adams               } else {
13288f7e8f9dSMark Adams                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
13298f7e8f9dSMark Adams                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
13308f7e8f9dSMark Adams                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
13318f7e8f9dSMark Adams                 for (int j=0; j<nz ; j++) {
13328f7e8f9dSMark Adams                   if (pbj[j] == colb) {
13338f7e8f9dSMark Adams                     pba[j] = vala;
13348f7e8f9dSMark Adams                     set++;
13358f7e8f9dSMark Adams                     break;
13368f7e8f9dSMark Adams                   }
13378f7e8f9dSMark Adams                 }
13388f1da0b2SJunchao Zhang                #if !defined(PETSC_HAVE_SYCL)
13398f7e8f9dSMark Adams                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
13408f1da0b2SJunchao Zhang                #endif
13418f7e8f9dSMark Adams               }
13428f7e8f9dSMark Adams             }
13438f7e8f9dSMark Adams           });
13448f7e8f9dSMark Adams       });
13458f7e8f9dSMark Adams     Kokkos::fence();
1346930e68a5SMark Adams 
13478f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
13488f7e8f9dSMark Adams         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
13498f7e8f9dSMark Adams         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
13508f7e8f9dSMark Adams         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1351042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
13528f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc;
13538f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
13548f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
13558f7e8f9dSMark Adams         for (int ii=start; ii<end-1; ii++) {
13568f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
13578f7e8f9dSMark Adams           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
13588f7e8f9dSMark Adams           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
13598f7e8f9dSMark Adams           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
13608f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
13618f7e8f9dSMark Adams               PetscInt kIdx = j*Ni + field_block_idx;
13628f7e8f9dSMark Adams               if (kIdx >= nzUi) /* void */ ;
13638f7e8f9dSMark Adams               else {
13648f7e8f9dSMark Adams                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
13658f7e8f9dSMark Adams                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
13668f7e8f9dSMark Adams                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
13678f7e8f9dSMark Adams                 size_t         st_idx;
13688f7e8f9dSMark Adams                 // find and do L(k,i) = A(:k,i) / A(i,i)
13698f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
13708f7e8f9dSMark Adams                 // get column, there has got to be a better way
13718f7e8f9dSMark Adams                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
13728f7e8f9dSMark Adams                     if (pjL[j] == ii) {
13738f7e8f9dSMark Adams                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
13748f7e8f9dSMark Adams                       idx = j; // output
13758f7e8f9dSMark Adams                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
13768f7e8f9dSMark Adams                     }
13778f7e8f9dSMark Adams                 }, st_idx);
13788f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
13798f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
138099551766SMark Adams                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
138199551766SMark Adams #endif
138299551766SMark Adams                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
13838f7e8f9dSMark Adams                 // U(i+1,:end)
13848f7e8f9dSMark Adams                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
13858f7e8f9dSMark Adams                       PetscScalar Uij = baUi[uiIdx];
13868f7e8f9dSMark Adams                       PetscInt    col = bjUi[uiIdx];
13878f7e8f9dSMark Adams                       if (col==myk) {
13888f7e8f9dSMark Adams                         // A_kk = A_kk - L_ki * U_ij(k)
13898f7e8f9dSMark Adams                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
13908f7e8f9dSMark Adams                         *Akkv = *Akkv - L_ki() * Uij; // UiK
13918f7e8f9dSMark Adams                       } else {
13928f7e8f9dSMark Adams                         PetscScalar    *start, *end, *pAkjv=NULL;
13938f7e8f9dSMark Adams                         PetscInt       high, low;
13948f7e8f9dSMark Adams                         const PetscInt *startj;
13958f7e8f9dSMark Adams                         if (col<myk) { // L
13968f7e8f9dSMark Adams                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
13978f7e8f9dSMark Adams                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
13988f7e8f9dSMark Adams                           start = pLki+1; // start at pLki+1, A22(myk,1)
13998f7e8f9dSMark Adams                           startj= bj_d + bi_d[myk] + idx;
14008f7e8f9dSMark Adams                           end   = ba_d + bi_d[myk+1];
14018f7e8f9dSMark Adams                         } else {
14028f7e8f9dSMark Adams                           PetscInt idx = bdiag_d[myk+1]+1;
14038f7e8f9dSMark Adams                           start = ba_d + idx;
14048f7e8f9dSMark Adams                           startj= bj_d + idx;
14058f7e8f9dSMark Adams                           end   = ba_d + bdiag_d[myk];
14068f7e8f9dSMark Adams                         }
14078f7e8f9dSMark Adams                         // search for 'col', use bisection search - TODO
14088f7e8f9dSMark Adams                         low  = 0;
14098f7e8f9dSMark Adams                         high = (PetscInt)(end-start);
14108f7e8f9dSMark Adams                         while (high-low > 5) {
14118f7e8f9dSMark Adams                           int t = (low+high)/2;
14128f7e8f9dSMark Adams                           if (startj[t] > col) high = t;
14138f7e8f9dSMark Adams                           else                 low  = t;
14148f7e8f9dSMark Adams                         }
14158f7e8f9dSMark Adams                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
14168f7e8f9dSMark Adams                           if (startj[pAkjv-start] == col) break;
14178f7e8f9dSMark Adams                         }
14188f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
141999551766SMark Adams                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
142099551766SMark Adams #endif
14218f7e8f9dSMark Adams                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
14228f7e8f9dSMark Adams                       }
14238f7e8f9dSMark Adams                     });
14248f7e8f9dSMark Adams               }
14258f7e8f9dSMark Adams             });
14268f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
14278f7e8f9dSMark Adams           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
14288f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
14298f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
14308f7e8f9dSMark Adams       });
14318f7e8f9dSMark Adams     Kokkos::fence();
14328f7e8f9dSMark Adams     Kokkos::deep_copy (h_flops_k, d_flops_k);
14339566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
14348f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
14358f7e8f9dSMark Adams         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
14368f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
14378f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
14388f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
14398f7e8f9dSMark Adams             int i = start + outer_index*Ni + lg_rank%Ni;
14408f7e8f9dSMark Adams             if (i < end) {
14418f7e8f9dSMark Adams               PetscScalar *pv = ba_d + bdiag_d[i];
14428f7e8f9dSMark Adams               *pv = 1.0/(*pv);
14438f7e8f9dSMark Adams             }
14448f7e8f9dSMark Adams           });
14458f7e8f9dSMark Adams       });
14468f7e8f9dSMark Adams   }
14479566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
14489566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isicol,&ic_h));
14499566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&r_h));
14508f7e8f9dSMark Adams 
14519566063dSJacob Faibussowitsch   PetscCall(ISIdentity(isrow,&row_identity));
14529566063dSJacob Faibussowitsch   PetscCall(ISIdentity(isicol,&col_identity));
14538f7e8f9dSMark Adams   if (b->inode.size) {
14548f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
14558f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
14568f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
14578f7e8f9dSMark Adams   } else {
14588f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
14598f7e8f9dSMark Adams   }
14608f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
14619566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU
14628f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
14638f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
14648f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
14658f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
14668f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
14678f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
14688f7e8f9dSMark Adams 
1469930e68a5SMark Adams   PetscFunctionReturn(0);
1470930e68a5SMark Adams }
1471930e68a5SMark Adams 
147286a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1473930e68a5SMark Adams {
1474930e68a5SMark Adams   PetscFunctionBegin;
14759566063dSJacob Faibussowitsch   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
147686a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
147786a27549SJunchao Zhang   PetscFunctionReturn(0);
147886a27549SJunchao Zhang }
147986a27549SJunchao Zhang 
148086a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
148186a27549SJunchao Zhang {
148286a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
148386a27549SJunchao Zhang 
148486a27549SJunchao Zhang   PetscFunctionBegin;
148586a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
148686a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
148786a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
148886a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
148986a27549SJunchao Zhang   }
149086a27549SJunchao Zhang   PetscFunctionReturn(0);
149186a27549SJunchao Zhang }
149286a27549SJunchao Zhang 
149386a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
149486a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
149586a27549SJunchao Zhang {
149686a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1497076ba34aSJunchao Zhang   MatColIdxType             n = A->rmap->n;
149886a27549SJunchao Zhang 
149986a27549SJunchao Zhang   PetscFunctionBegin;
150086a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
150186a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
1502076ba34aSJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
150386a27549SJunchao Zhang     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1504076ba34aSJunchao Zhang     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1505076ba34aSJunchao Zhang     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
150686a27549SJunchao Zhang 
150786a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1508076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1509076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1510076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
151186a27549SJunchao Zhang         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
151286a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
151386a27549SJunchao Zhang 
151486a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
151586a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
151686a27549SJunchao Zhang     */
1517076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1518076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
151986a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
152086a27549SJunchao Zhang 
152186a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
152286a27549SJunchao Zhang 
152386a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
1524076ba34aSJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
152586a27549SJunchao Zhang     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1526076ba34aSJunchao Zhang     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1527076ba34aSJunchao Zhang     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
152886a27549SJunchao Zhang 
152986a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1530076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1531076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1532076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
153386a27549SJunchao Zhang         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
153486a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
153586a27549SJunchao Zhang 
153686a27549SJunchao Zhang     /* Sort indices. See comments above */
1537076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1538076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
153986a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
154086a27549SJunchao Zhang 
154186a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
154286a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
154386a27549SJunchao Zhang   }
154486a27549SJunchao Zhang   PetscFunctionReturn(0);
154586a27549SJunchao Zhang }
154686a27549SJunchao Zhang 
154786a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
154886a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
154986a27549SJunchao Zhang {
155086a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
155186a27549SJunchao Zhang   PetscScalarKokkosView          xv;
155286a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
155386a27549SJunchao Zhang 
155486a27549SJunchao Zhang   PetscFunctionBegin;
15559566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
15569566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
15579566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(b,&bv));
15589566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(x,&xv));
155986a27549SJunchao Zhang   /* Solve L tmpv = b */
15609566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
156186a27549SJunchao Zhang   /* Solve Ux = tmpv */
15629566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
15639566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(b,&bv));
15649566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
15659566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
156686a27549SJunchao Zhang   PetscFunctionReturn(0);
156786a27549SJunchao Zhang }
156886a27549SJunchao Zhang 
1569076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
157086a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
157186a27549SJunchao Zhang {
157286a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
157386a27549SJunchao Zhang   PetscScalarKokkosView          xv;
157486a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
157586a27549SJunchao Zhang 
157686a27549SJunchao Zhang   PetscFunctionBegin;
15779566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
15789566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
15799566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(b,&bv));
15809566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(x,&xv));
158186a27549SJunchao Zhang   /* Solve U^T tmpv = b */
158286a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
158386a27549SJunchao Zhang 
158486a27549SJunchao Zhang   /* Solve L^T x = tmpv */
158586a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
15869566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(b,&bv));
15879566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
15889566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
158986a27549SJunchao Zhang   PetscFunctionReturn(0);
159086a27549SJunchao Zhang }
159186a27549SJunchao Zhang 
159286a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
159386a27549SJunchao Zhang {
159486a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
159586a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
159686a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
159786a27549SJunchao Zhang 
159886a27549SJunchao Zhang   PetscFunctionBegin;
15999566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
16009566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1601076ba34aSJunchao Zhang 
1602076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1603076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1604076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1605076ba34aSJunchao Zhang 
1606076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
160786a27549SJunchao Zhang 
160886a27549SJunchao Zhang   B->assembled                       = PETSC_TRUE;
160986a27549SJunchao Zhang   B->preallocated                    = PETSC_TRUE;
161086a27549SJunchao Zhang   B->ops->solve                      = MatSolve_SeqAIJKokkos;
161186a27549SJunchao Zhang   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
161286a27549SJunchao Zhang   B->ops->matsolve                   = NULL;
161386a27549SJunchao Zhang   B->ops->matsolvetranspose          = NULL;
161486a27549SJunchao Zhang   B->offloadmask                     = PETSC_OFFLOAD_GPU;
161586a27549SJunchao Zhang 
161686a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
161786a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
161886a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1619eeadb341SJunchao Zhang   /* TODO: log flops, but how to know that? */
16209566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
162186a27549SJunchao Zhang   PetscFunctionReturn(0);
162286a27549SJunchao Zhang }
162386a27549SJunchao Zhang 
162486a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
162586a27549SJunchao Zhang {
162686a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok;
162786a27549SJunchao Zhang   Mat_SeqAIJ                     *b;
162886a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
162986a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
163086a27549SJunchao Zhang   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
163186a27549SJunchao Zhang   PetscInt                       n = A->rmap->n;
163286a27549SJunchao Zhang 
163386a27549SJunchao Zhang   PetscFunctionBegin;
16349566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
163586a27549SJunchao Zhang   /* Rebuild factors */
163686a27549SJunchao Zhang   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
163786a27549SJunchao Zhang   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
163886a27549SJunchao Zhang 
163986a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
164086a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
164186a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
164286a27549SJunchao Zhang 
164386a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
164486a27549SJunchao Zhang 
164586a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
164686a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
164786a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d,n+1);
164886a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
164986a27549SJunchao Zhang 
165086a27549SJunchao Zhang   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1651076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1652076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1653076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
165486a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
165586a27549SJunchao Zhang 
165686a27549SJunchao Zhang   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
165786a27549SJunchao Zhang   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
165886a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
165986a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
166086a27549SJunchao Zhang 
166186a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
166286a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
166386a27549SJunchao Zhang  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
166486a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
166586a27549SJunchao Zhang  #else
166686a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
166786a27549SJunchao Zhang  #endif
166886a27549SJunchao Zhang 
166986a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
167086a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
167186a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
167286a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
167386a27549SJunchao Zhang 
167486a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
16759566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL));
167686a27549SJunchao Zhang   b     = (Mat_SeqAIJ*)B->data;
167786a27549SJunchao Zhang   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
167886a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
167986a27549SJunchao Zhang   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
168086a27549SJunchao Zhang 
168186a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
168286a27549SJunchao Zhang   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1683930e68a5SMark Adams   PetscFunctionReturn(0);
1684930e68a5SMark Adams }
1685930e68a5SMark Adams 
16868f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
16878f7e8f9dSMark Adams {
16888f7e8f9dSMark Adams   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
16898f7e8f9dSMark Adams   const PetscInt   nrows   = A->rmap->n;
1690930e68a5SMark Adams 
16918f7e8f9dSMark Adams   PetscFunctionBegin;
16929566063dSJacob Faibussowitsch   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
16938f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
16948f7e8f9dSMark Adams   // move B data into Kokkos
16959566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
16969566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
16978f7e8f9dSMark Adams   {
16988f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1699300d22a6SJunchao Zhang     if (!baijkok->diag_d.extent(0)) {
17008f7e8f9dSMark Adams       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1701300d22a6SJunchao Zhang       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1702300d22a6SJunchao Zhang       Kokkos::deep_copy (baijkok->diag_d, h_diag);
17038f7e8f9dSMark Adams     }
17048f7e8f9dSMark Adams   }
17058f7e8f9dSMark Adams   PetscFunctionReturn(0);
17068f7e8f9dSMark Adams }
17078f7e8f9dSMark Adams 
170886a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1709930e68a5SMark Adams {
1710930e68a5SMark Adams   PetscFunctionBegin;
1711930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
1712930e68a5SMark Adams   PetscFunctionReturn(0);
1713930e68a5SMark Adams }
1714930e68a5SMark Adams 
17158f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
17168f7e8f9dSMark Adams {
17178f7e8f9dSMark Adams   PetscFunctionBegin;
17188f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
17198f7e8f9dSMark Adams   PetscFunctionReturn(0);
17208f7e8f9dSMark Adams }
17218f7e8f9dSMark Adams 
1722930e68a5SMark Adams /*MC
172386a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
172486a27549SJunchao Zhang   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1725930e68a5SMark Adams 
1726930e68a5SMark Adams   Level: beginner
1727930e68a5SMark Adams 
17283f3ba80aSJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1729930e68a5SMark Adams M*/
173086a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1731930e68a5SMark Adams {
1732930e68a5SMark Adams   PetscInt       n = A->rmap->n;
1733930e68a5SMark Adams 
1734930e68a5SMark Adams   PetscFunctionBegin;
17359566063dSJacob Faibussowitsch   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
17369566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*B,n,n,n,n));
1737930e68a5SMark Adams   (*B)->factortype = ftype;
17389566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
17399566063dSJacob Faibussowitsch   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1740930e68a5SMark Adams 
17418f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
17429566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
174386a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_TRUE;
174486a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
174586a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
17469566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
174786a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
174886a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
174998921bdaSJacob Faibussowitsch   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1750930e68a5SMark Adams 
17519566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
17529566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos));
1753930e68a5SMark Adams   PetscFunctionReturn(0);
1754930e68a5SMark Adams }
17558f7e8f9dSMark Adams 
17568f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
17578f7e8f9dSMark Adams {
17588f7e8f9dSMark Adams   PetscInt       n = A->rmap->n;
17598f7e8f9dSMark Adams 
17608f7e8f9dSMark Adams   PetscFunctionBegin;
17619566063dSJacob Faibussowitsch   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
17629566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*B,n,n,n,n));
17638f7e8f9dSMark Adams   (*B)->factortype = ftype;
1764f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
17659566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
17669566063dSJacob Faibussowitsch   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
17678f7e8f9dSMark Adams 
17688f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
17699566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
17708f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
17718f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
17728f7e8f9dSMark Adams 
17739566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
17749566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device));
17758f7e8f9dSMark Adams   PetscFunctionReturn(0);
17768f7e8f9dSMark Adams }
177786a27549SJunchao Zhang 
177886a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
177986a27549SJunchao Zhang {
178086a27549SJunchao Zhang   PetscFunctionBegin;
17819566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos));
17829566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos));
17839566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device));
178486a27549SJunchao Zhang   PetscFunctionReturn(0);
178586a27549SJunchao Zhang }
178686a27549SJunchao Zhang 
1787076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
1788076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1789076ba34aSJunchao Zhang {
1790076ba34aSJunchao Zhang   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1791076ba34aSJunchao Zhang   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1792076ba34aSJunchao Zhang   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1793076ba34aSJunchao Zhang   const PetscInt    *i = iv.data();
1794076ba34aSJunchao Zhang   const PetscInt    *j = jv.data();
1795076ba34aSJunchao Zhang   const PetscScalar *a = av.data();
1796076ba34aSJunchao Zhang   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1797076ba34aSJunchao Zhang 
1798076ba34aSJunchao Zhang   PetscFunctionBegin;
17999566063dSJacob Faibussowitsch   PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz));
1800076ba34aSJunchao Zhang   for (PetscInt k=0; k<m; k++) {
18019566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k));
1802076ba34aSJunchao Zhang     for (PetscInt p=i[k]; p<i[k+1]; p++) {
18039566063dSJacob Faibussowitsch       PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]));
1804076ba34aSJunchao Zhang     }
18059566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n"));
1806076ba34aSJunchao Zhang   }
1807076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1808076ba34aSJunchao Zhang }
1809