xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision f98996d33ec2508aac590d2156ec44b6b751ff15)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2076ba34aSJunchao Zhang #include <petscpkg_version.h>
3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
442550becSJunchao Zhang #include <petsc/private/sfimpl.h>
58c3ff71bSJunchao Zhang #include <petscsystypes.h>
68c3ff71bSJunchao Zhang #include <petscerror.h>
78c3ff71bSJunchao Zhang 
88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
9f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
108c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
118c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
14076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
15076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
1686a27549SJunchao Zhang 
1742550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
188c3ff71bSJunchao Zhang 
19*f98996d3SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3,6,99)
20*f98996d3SJunchao Zhang   #include <KokkosSparse_Utils.hpp>
21*f98996d3SJunchao Zhang   using KokkosSparse::Impl::transpose_matrix;
22*f98996d3SJunchao Zhang   using KokkosSparse::sort_crs_matrix;
23*f98996d3SJunchao Zhang #else
24*f98996d3SJunchao Zhang   #include <KokkosKernels_Sorting.hpp>
25*f98996d3SJunchao Zhang   using KokkosKernels::Impl::transpose_matrix;
26*f98996d3SJunchao Zhang   using KokkosKernels::sort_crs_matrix;
27*f98996d3SJunchao Zhang #endif
28*f98996d3SJunchao Zhang 
298c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
308c3ff71bSJunchao Zhang 
31076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
32076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
33076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
34076ba34aSJunchao Zhang  */
358c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
368c3ff71bSJunchao Zhang {
37076ba34aSJunchao Zhang   Mat_SeqAIJ       *aijseq;
38076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
398c3ff71bSJunchao Zhang 
408c3ff71bSJunchao Zhang   PetscFunctionBegin;
41076ba34aSJunchao Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
429566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
43076ba34aSJunchao Zhang 
44076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
45076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
46076ba34aSJunchao Zhang 
47076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
48076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
49076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
50076ba34aSJunchao Zhang   */
51076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
52076ba34aSJunchao Zhang     delete aijkok;
53076ba34aSJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
54076ba34aSJunchao Zhang     A->spptr = aijkok;
55076ba34aSJunchao Zhang   }
56076ba34aSJunchao Zhang 
575519a089SJose E. Roman   if (aijkok->device_mat_d.data()) {
58a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
59a587d139SMark   }
608c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
618c3ff71bSJunchao Zhang }
628c3ff71bSJunchao Zhang 
6386a27549SJunchao Zhang /* Sync CSR data to device if not yet */
64076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
658c3ff71bSJunchao Zhang {
668c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
678c3ff71bSJunchao Zhang 
688c3ff71bSJunchao Zhang   PetscFunctionBegin;
695f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
705f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
71076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
72076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
73580c7c76SPierre Jolivet     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
7486a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
758c3ff71bSJunchao Zhang   }
768c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
778c3ff71bSJunchao Zhang }
788c3ff71bSJunchao Zhang 
79076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
80fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
8186a27549SJunchao Zhang {
8286a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
8386a27549SJunchao Zhang 
8486a27549SJunchao Zhang   PetscFunctionBegin;
855f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
8686a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
8786a27549SJunchao Zhang   aijkok->a_dual.modify_device();
8886a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
8986a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
909566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
919566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
9286a27549SJunchao Zhang   PetscFunctionReturn(0);
9386a27549SJunchao Zhang }
9486a27549SJunchao Zhang 
95f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
96f0cf5187SStefano Zampini {
97f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
98f0cf5187SStefano Zampini 
99f0cf5187SStefano Zampini   PetscFunctionBegin;
100f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
10186a27549SJunchao Zhang   /* We do not expect one needs factors on host  */
1025f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
1035f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
104076ba34aSJunchao Zhang   aijkok->a_dual.sync_host();
105f0cf5187SStefano Zampini   PetscFunctionReturn(0);
106f0cf5187SStefano Zampini }
107f0cf5187SStefano Zampini 
108f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
109f0cf5187SStefano Zampini {
110076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
111f0cf5187SStefano Zampini 
112f0cf5187SStefano Zampini   PetscFunctionBegin;
1135519a089SJose E. Roman   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
1145519a089SJose E. Roman     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
1155519a089SJose E. Roman     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
1165519a089SJose E. Roman     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
1175519a089SJose E. Roman   */
1185519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
119076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
120076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
121076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
122076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
123076ba34aSJunchao Zhang   }
124076ba34aSJunchao Zhang   PetscFunctionReturn(0);
125076ba34aSJunchao Zhang }
126076ba34aSJunchao Zhang 
127076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
128076ba34aSJunchao Zhang {
129076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
130076ba34aSJunchao Zhang 
131076ba34aSJunchao Zhang   PetscFunctionBegin;
1325519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
133076ba34aSJunchao Zhang   PetscFunctionReturn(0);
134076ba34aSJunchao Zhang }
135076ba34aSJunchao Zhang 
136076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
137076ba34aSJunchao Zhang {
138076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
139076ba34aSJunchao Zhang 
140076ba34aSJunchao Zhang   PetscFunctionBegin;
1415519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
142076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
143076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1442328674fSJunchao Zhang   } else {
1452328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1462328674fSJunchao Zhang   }
147076ba34aSJunchao Zhang   PetscFunctionReturn(0);
148076ba34aSJunchao Zhang }
149076ba34aSJunchao Zhang 
150076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
151076ba34aSJunchao Zhang {
152076ba34aSJunchao Zhang   PetscFunctionBegin;
153076ba34aSJunchao Zhang   *array = NULL;
154076ba34aSJunchao Zhang   PetscFunctionReturn(0);
155076ba34aSJunchao Zhang }
156076ba34aSJunchao Zhang 
157076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
158076ba34aSJunchao Zhang {
159076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
160076ba34aSJunchao Zhang 
161076ba34aSJunchao Zhang   PetscFunctionBegin;
1625519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
163076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1642328674fSJunchao Zhang   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
1652328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1662328674fSJunchao Zhang   }
167076ba34aSJunchao Zhang   PetscFunctionReturn(0);
168076ba34aSJunchao Zhang }
169076ba34aSJunchao Zhang 
170076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
171076ba34aSJunchao Zhang {
172076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
173076ba34aSJunchao Zhang 
174076ba34aSJunchao Zhang   PetscFunctionBegin;
1755519a089SJose E. Roman   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
176076ba34aSJunchao Zhang     aijkok->a_dual.clear_sync_state();
177076ba34aSJunchao Zhang     aijkok->a_dual.modify_host();
1782328674fSJunchao Zhang   }
179f0cf5187SStefano Zampini   PetscFunctionReturn(0);
180f0cf5187SStefano Zampini }
181f0cf5187SStefano Zampini 
1827ee59b9bSJunchao Zhang static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
1837ee59b9bSJunchao Zhang {
1847ee59b9bSJunchao Zhang   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1857ee59b9bSJunchao Zhang 
1867ee59b9bSJunchao Zhang   PetscFunctionBegin;
1877ee59b9bSJunchao Zhang   PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL");
1887ee59b9bSJunchao Zhang 
1897ee59b9bSJunchao Zhang   if (i) *i = aijkok->i_device_data();
1907ee59b9bSJunchao Zhang   if (j) *j = aijkok->j_device_data();
1917ee59b9bSJunchao Zhang   if (a) {
1927ee59b9bSJunchao Zhang     aijkok->a_dual.sync_device();
1937ee59b9bSJunchao Zhang     *a = aijkok->a_device_data();
1947ee59b9bSJunchao Zhang   }
1957ee59b9bSJunchao Zhang   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
1967ee59b9bSJunchao Zhang   PetscFunctionReturn(0);
1977ee59b9bSJunchao Zhang }
1987ee59b9bSJunchao Zhang 
199a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
200042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
201a587d139SMark {
202042217e8SBarry Smith   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
203042217e8SBarry Smith   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
204a587d139SMark 
205a587d139SMark   PetscFunctionBegin;
2065f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207152b3e56SJunchao Zhang   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
208a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
209a587d139SMark   PetscFunctionReturn(0);
210a587d139SMark }
211a587d139SMark 
212a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
213042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
214a587d139SMark {
215042217e8SBarry Smith   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
216a587d139SMark 
217a587d139SMark   PetscFunctionBegin;
218a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
219a587d139SMark     *d_mat = aijkok->device_mat_d.data();
220a587d139SMark   } else {
2219566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
222a587d139SMark     *d_mat  = NULL;
223a587d139SMark   }
224a587d139SMark   PetscFunctionReturn(0);
225a587d139SMark }
226076ba34aSJunchao Zhang 
227076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */
228076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
229152b3e56SJunchao Zhang {
230152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
231152b3e56SJunchao Zhang 
232152b3e56SJunchao Zhang   PetscFunctionBegin;
2335f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
234076ba34aSJunchao Zhang   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
235076ba34aSJunchao Zhang     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
2369566063dSJacob Faibussowitsch     PetscCallCXX(aijkok->a_dual.sync_device());
237*f98996d3SJunchao Zhang     PetscCallCXX(aijkok->csrmatT = transpose_matrix(aijkok->csrmat));
238*f98996d3SJunchao Zhang     PetscCallCXX(sort_crs_matrix(aijkok->csrmatT));
23986a27549SJunchao Zhang     aijkok->transpose_updated = PETSC_TRUE;
240076ba34aSJunchao Zhang   }
241076ba34aSJunchao Zhang   *csrmatT = &aijkok->csrmatT;
242152b3e56SJunchao Zhang   PetscFunctionReturn(0);
243152b3e56SJunchao Zhang }
244152b3e56SJunchao Zhang 
245076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */
246076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
247152b3e56SJunchao Zhang {
248152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
249152b3e56SJunchao Zhang 
250152b3e56SJunchao Zhang   PetscFunctionBegin;
2519566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2525f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
253076ba34aSJunchao Zhang   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
2549566063dSJacob Faibussowitsch     PetscCallCXX(aijkok->a_dual.sync_device());
255*f98996d3SJunchao Zhang     PetscCallCXX(aijkok->csrmatH = transpose_matrix(aijkok->csrmat));
256*f98996d3SJunchao Zhang     PetscCallCXX(sort_crs_matrix(aijkok->csrmatH));
257076ba34aSJunchao Zhang    #if defined(PETSC_USE_COMPLEX)
258076ba34aSJunchao Zhang     const auto& a = aijkok->csrmatH.values;
259076ba34aSJunchao Zhang     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
260076ba34aSJunchao Zhang    #endif
26186a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_TRUE;
262076ba34aSJunchao Zhang   }
263076ba34aSJunchao Zhang   *csrmatH = &aijkok->csrmatH;
2649566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
265152b3e56SJunchao Zhang   PetscFunctionReturn(0);
266152b3e56SJunchao Zhang }
267a587d139SMark 
2688c3ff71bSJunchao Zhang /* y = A x */
2698c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2708c3ff71bSJunchao Zhang {
2718c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
272152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
273152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2748c3ff71bSJunchao Zhang 
2758c3ff71bSJunchao Zhang   PetscFunctionBegin;
2769566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2779566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2789566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx,&xv));
2799566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy,&yv));
2808c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
281152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
2829566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx,&xv));
2839566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
284076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
2859566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
2869566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
2878c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2888c3ff71bSJunchao Zhang }
2898c3ff71bSJunchao Zhang 
2908c3ff71bSJunchao Zhang /* y = A^T x */
2918c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2928c3ff71bSJunchao Zhang {
2938c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
294152b3e56SJunchao Zhang   const char                       *mode;
295152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
296152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
297076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
2988c3ff71bSJunchao Zhang 
2998c3ff71bSJunchao Zhang   PetscFunctionBegin;
3009566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3019566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3029566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx,&xv));
3039566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy,&yv));
304152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
3059566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
306152b3e56SJunchao Zhang     mode = "N";
307152b3e56SJunchao Zhang   } else {
308076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
309076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
310152b3e56SJunchao Zhang     mode = "T";
311152b3e56SJunchao Zhang   }
312076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
3139566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx,&xv));
3149566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
3159566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
3169566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3178c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3188c3ff71bSJunchao Zhang }
3198c3ff71bSJunchao Zhang 
3208c3ff71bSJunchao Zhang /* y = A^H x */
3218c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
3228c3ff71bSJunchao Zhang {
3238c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
324152b3e56SJunchao Zhang   const char                       *mode;
325152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
326152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
327076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3288c3ff71bSJunchao Zhang 
3298c3ff71bSJunchao Zhang   PetscFunctionBegin;
3309566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3319566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3329566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx,&xv));
3339566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(yy,&yv));
334152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
3359566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
336152b3e56SJunchao Zhang     mode = "N";
337152b3e56SJunchao Zhang   } else {
338076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
339076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
340152b3e56SJunchao Zhang     mode = "C";
341152b3e56SJunchao Zhang   }
342076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
3439566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx,&xv));
3449566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(yy,&yv));
3459566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
3469566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3478c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3488c3ff71bSJunchao Zhang }
3498c3ff71bSJunchao Zhang 
3508c3ff71bSJunchao Zhang /* z = A x + y */
3518c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
3528c3ff71bSJunchao Zhang {
3538c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
354152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
355152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
3568c3ff71bSJunchao Zhang 
3578c3ff71bSJunchao Zhang   PetscFunctionBegin;
3589566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3599566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3609566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx,&xv));
3619566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(yy,&yv));
3629566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(zz,&zv));
3638c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
3648c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
365152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
3669566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx,&xv));
3679566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(yy,&yv));
3689566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
3699566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
3709566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3718c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3728c3ff71bSJunchao Zhang }
3738c3ff71bSJunchao Zhang 
3748c3ff71bSJunchao Zhang /* z = A^T x + y */
3758c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3768c3ff71bSJunchao Zhang {
3778c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
378152b3e56SJunchao Zhang   const char                       *mode;
379152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
380152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
381076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3828c3ff71bSJunchao Zhang 
3838c3ff71bSJunchao Zhang   PetscFunctionBegin;
3849566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3859566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
3869566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx,&xv));
3879566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(yy,&yv));
3889566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(zz,&zv));
3898c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
390152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
3919566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
392152b3e56SJunchao Zhang     mode = "N";
393152b3e56SJunchao Zhang   } else {
394076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
395076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
396152b3e56SJunchao Zhang     mode = "T";
397152b3e56SJunchao Zhang   }
398076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
3999566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx,&xv));
4009566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(yy,&yv));
4019566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
4029566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
4039566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4048c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4058c3ff71bSJunchao Zhang }
4068c3ff71bSJunchao Zhang 
4078c3ff71bSJunchao Zhang /* z = A^H x + y */
4088c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
4098c3ff71bSJunchao Zhang {
4108c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
411152b3e56SJunchao Zhang   const char                       *mode;
412152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
413152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
414076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
4158c3ff71bSJunchao Zhang 
4168c3ff71bSJunchao Zhang   PetscFunctionBegin;
4179566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4189566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
4199566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(xx,&xv));
4209566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(yy,&yv));
4219566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(zz,&zv));
4228c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
423152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
4249566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
425152b3e56SJunchao Zhang     mode = "N";
426152b3e56SJunchao Zhang   } else {
427076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
428076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
429152b3e56SJunchao Zhang     mode = "C";
430152b3e56SJunchao Zhang   }
431076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
4329566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(xx,&xv));
4339566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(yy,&yv));
4349566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(zz,&zv));
4359566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz()));
4369566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
437152b3e56SJunchao Zhang   PetscFunctionReturn(0);
438152b3e56SJunchao Zhang }
439152b3e56SJunchao Zhang 
440152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
441152b3e56SJunchao Zhang {
442152b3e56SJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
443152b3e56SJunchao Zhang 
444152b3e56SJunchao Zhang   PetscFunctionBegin;
445152b3e56SJunchao Zhang   switch (op) {
446152b3e56SJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
447152b3e56SJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
4489566063dSJacob Faibussowitsch       if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
449152b3e56SJunchao Zhang       A->form_explicit_transpose = flg;
450152b3e56SJunchao Zhang       break;
451152b3e56SJunchao Zhang     default:
4529566063dSJacob Faibussowitsch       PetscCall(MatSetOption_SeqAIJ(A,op,flg));
453152b3e56SJunchao Zhang       break;
454152b3e56SJunchao Zhang   }
4558c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4568c3ff71bSJunchao Zhang }
4578c3ff71bSJunchao Zhang 
458076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
4593d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
4608c3ff71bSJunchao Zhang {
461076ba34aSJunchao Zhang   Mat_SeqAIJ       *aseq;
4628c3ff71bSJunchao Zhang 
4638c3ff71bSJunchao Zhang   PetscFunctionBegin;
4649566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
465076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
4669566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */
4678c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
4689566063dSJacob Faibussowitsch     PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
469076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
4705f80ce2aSJacob Faibussowitsch     PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
4719566063dSJacob Faibussowitsch     PetscCall(PetscFree(A->defaultvectype));
4729566063dSJacob Faibussowitsch     PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */
4739566063dSJacob Faibussowitsch     PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS));
4749566063dSJacob Faibussowitsch     PetscCall(MatSetOps_SeqAIJKokkos(A));
475076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ*>(A->data);
476394ed5ebSJunchao Zhang     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
4775f80ce2aSJacob Faibussowitsch       PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
478076ba34aSJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
4798c3ff71bSJunchao Zhang     }
480076ba34aSJunchao Zhang   }
4818c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4828c3ff71bSJunchao Zhang }
4838c3ff71bSJunchao Zhang 
484076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
485076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
486076ba34aSJunchao Zhang  */
487076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
4888c3ff71bSJunchao Zhang {
489076ba34aSJunchao Zhang   Mat_SeqAIJ            *bseq;
490076ba34aSJunchao Zhang   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
491076ba34aSJunchao Zhang   Mat                   mat;
4928c3ff71bSJunchao Zhang 
4938c3ff71bSJunchao Zhang   PetscFunctionBegin;
494076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
4959566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B));
496076ba34aSJunchao Zhang   mat  = *B;
497076ba34aSJunchao Zhang   if (A->assembled) {
498076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
499076ba34aSJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
500076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
501076ba34aSJunchao Zhang     /* Now copy values to B if needed */
502076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
503076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
504076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
505076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
506076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
507076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
508076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
509076ba34aSJunchao Zhang       }
510076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
511076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
512076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
513076ba34aSJunchao Zhang     }
514076ba34aSJunchao Zhang     mat->spptr = bkok;
515076ba34aSJunchao Zhang   }
516076ba34aSJunchao Zhang 
5179566063dSJacob Faibussowitsch   PetscCall(PetscFree(mat->defaultvectype));
5189566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */
5199566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS));
5209566063dSJacob Faibussowitsch   PetscCall(MatSetOps_SeqAIJKokkos(mat));
5218c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5228c3ff71bSJunchao Zhang }
5238c3ff71bSJunchao Zhang 
5240ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
5250ecb592aSJunchao Zhang {
5260ecb592aSJunchao Zhang   Mat               At;
527ff751488SJunchao Zhang   KokkosCsrMatrix   *internT;
5280ecb592aSJunchao Zhang   Mat_SeqAIJKokkos  *atkok,*bkok;
5290ecb592aSJunchao Zhang 
5300ecb592aSJunchao Zhang   PetscFunctionBegin;
5317fb60732SBarry Smith   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A,*B));
5329566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */
5330ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
534ff751488SJunchao Zhang     /* Deep copy internT, as we want to isolate the internal transpose */
5359566063dSJacob Faibussowitsch     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
5369566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At));
5370ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
5389566063dSJacob Faibussowitsch     else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */
5390ecb592aSJunchao Zhang   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
5400ecb592aSJunchao Zhang     if ((*B)->assembled) {
5410ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
5429566063dSJacob Faibussowitsch       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
5439566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
5440ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
5450ecb592aSJunchao Zhang       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
5460ecb592aSJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
5470ecb592aSJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
5489566063dSJacob Faibussowitsch       PetscCallCXX(Kokkos::deep_copy(a_h,internT->values));
5499566063dSJacob Faibussowitsch       PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
5500ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
5510ecb592aSJunchao Zhang   }
5520ecb592aSJunchao Zhang   PetscFunctionReturn(0);
5530ecb592aSJunchao Zhang }
5540ecb592aSJunchao Zhang 
5558c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
5568c3ff71bSJunchao Zhang {
55786a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
5588c3ff71bSJunchao Zhang 
5598c3ff71bSJunchao Zhang   PetscFunctionBegin;
56086a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
56186a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
5628c3ff71bSJunchao Zhang     delete aijkok;
56386a27549SJunchao Zhang   } else {
56486a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
56586a27549SJunchao Zhang   }
566cbc6b225SStefano Zampini   A->spptr = NULL;
5679566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
5689566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
5699566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
5709566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
5718c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5728c3ff71bSJunchao Zhang }
5738c3ff71bSJunchao Zhang 
5743f3ba80aSJunchao Zhang /*MC
5753f3ba80aSJunchao Zhang    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
5763f3ba80aSJunchao Zhang 
5773f3ba80aSJunchao Zhang    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
5783f3ba80aSJunchao Zhang 
5793f3ba80aSJunchao Zhang    Options Database Keys:
5803f3ba80aSJunchao Zhang .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
5813f3ba80aSJunchao Zhang 
5823f3ba80aSJunchao Zhang   Level: beginner
5833f3ba80aSJunchao Zhang 
584db781477SPatrick Sanan .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
5853f3ba80aSJunchao Zhang M*/
58686a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
58786a27549SJunchao Zhang {
58886a27549SJunchao Zhang   PetscFunctionBegin;
5899566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
5909566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(A));
5919566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A));
59286a27549SJunchao Zhang   PetscFunctionReturn(0);
59386a27549SJunchao Zhang }
59486a27549SJunchao Zhang 
595076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
596076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
597a3f881fbSStefano Zampini {
598076ba34aSJunchao Zhang   Mat_SeqAIJ                   *a,*b;
599076ba34aSJunchao Zhang   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
600076ba34aSJunchao Zhang   MatScalarKokkosView          aa,ba,ca;
601076ba34aSJunchao Zhang   MatRowMapKokkosView          ai,bi,ci;
602076ba34aSJunchao Zhang   MatColIdxKokkosView          aj,bj,cj;
603076ba34aSJunchao Zhang   PetscInt                     m,n,nnz,aN;
604a3f881fbSStefano Zampini 
605a3f881fbSStefano Zampini   PetscFunctionBegin;
606076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
607076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
608076ba34aSJunchao Zhang   PetscValidPointer(C,4);
609076ba34aSJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
610076ba34aSJunchao Zhang   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
6115f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
6125f80ce2aSJacob Faibussowitsch   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
613076ba34aSJunchao Zhang 
6149566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
6159566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
616076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ*>(A->data);
617076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ*>(B->data);
618076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
619076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
620076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
621076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
622076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
623076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
624076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
625076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
626076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
627076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
628076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
629076ba34aSJunchao Zhang     aj = akok->j_dual.view_device();
630076ba34aSJunchao Zhang     bj = bkok->j_dual.view_device();
631076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
632076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
633076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
634076ba34aSJunchao Zhang     ca = ca_dual.view_device();
635076ba34aSJunchao Zhang     ci = ci_dual.view_device();
636076ba34aSJunchao Zhang     cj = cj_dual.view_device();
637076ba34aSJunchao Zhang 
638076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
639076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
640076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
641076ba34aSJunchao Zhang       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
642076ba34aSJunchao Zhang 
643076ba34aSJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
644076ba34aSJunchao Zhang         ci(i) = coffset;
645076ba34aSJunchao Zhang         if (i == m-1) ci(m) = ai(m) + bi(m);
646076ba34aSJunchao Zhang       });
647076ba34aSJunchao Zhang 
648076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
649076ba34aSJunchao Zhang         if (k < alen) {
650076ba34aSJunchao Zhang           ca(coffset+k) = aa(ai(i)+k);
651076ba34aSJunchao Zhang           cj(coffset+k) = aj(ai(i)+k);
652076ba34aSJunchao Zhang         } else {
653076ba34aSJunchao Zhang           ca(coffset+k) = ba(bi(i)+k-alen);
654076ba34aSJunchao Zhang           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
655076ba34aSJunchao Zhang         }
656076ba34aSJunchao Zhang       });
657076ba34aSJunchao Zhang     });
658076ba34aSJunchao Zhang     ca_dual.modify_device();
659076ba34aSJunchao Zhang     ci_dual.modify_device();
660076ba34aSJunchao Zhang     cj_dual.modify_device();
6619566063dSJacob Faibussowitsch     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
6629566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C));
663076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
664076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
665076ba34aSJunchao Zhang     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
666076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
667076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
668076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
669076ba34aSJunchao Zhang 
670076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
671076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
672076ba34aSJunchao Zhang       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
673076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
674076ba34aSJunchao Zhang         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
675076ba34aSJunchao Zhang         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
676076ba34aSJunchao Zhang       });
677076ba34aSJunchao Zhang     });
6789566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
679076ba34aSJunchao Zhang   }
680076ba34aSJunchao Zhang   PetscFunctionReturn(0);
681076ba34aSJunchao Zhang }
682076ba34aSJunchao Zhang 
683076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
684076ba34aSJunchao Zhang {
685076ba34aSJunchao Zhang   PetscFunctionBegin;
686076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
687a3f881fbSStefano Zampini   PetscFunctionReturn(0);
688a3f881fbSStefano Zampini }
689a3f881fbSStefano Zampini 
690a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
691a3f881fbSStefano Zampini {
692a3f881fbSStefano Zampini   Mat_Product                    *product = C->product;
693a3f881fbSStefano Zampini   Mat                            A,B;
694076ba34aSJunchao Zhang   bool                           transA,transB; /* use bool, since KK needs this type */
695a3f881fbSStefano Zampini   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
696a3f881fbSStefano Zampini   Mat_SeqAIJ                     *c;
697076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
698076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB;
699a3f881fbSStefano Zampini 
700a3f881fbSStefano Zampini   PetscFunctionBegin;
701a3f881fbSStefano Zampini   MatCheckProduct(C,1);
7025f80ce2aSJacob Faibussowitsch   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
703076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
704076ba34aSJunchao Zhang 
705076ba34aSJunchao Zhang   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
706076ba34aSJunchao Zhang     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
707076ba34aSJunchao Zhang     PetscFunctionReturn(0);
708076ba34aSJunchao Zhang   }
709076ba34aSJunchao Zhang 
710076ba34aSJunchao Zhang   switch (product->type) {
711076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
712076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
713076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
714076ba34aSJunchao Zhang     default:
71598921bdaSJacob Faibussowitsch       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
716076ba34aSJunchao Zhang   }
717076ba34aSJunchao Zhang 
718a3f881fbSStefano Zampini   A     = product->A;
719a3f881fbSStefano Zampini   B     = product->B;
7209566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
7219566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
722a3f881fbSStefano Zampini   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
723a3f881fbSStefano Zampini   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
724a3f881fbSStefano Zampini   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
725076ba34aSJunchao Zhang 
7265f80ce2aSJacob Faibussowitsch   PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
727076ba34aSJunchao Zhang 
728076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
729076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
730076ba34aSJunchao Zhang 
731076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
732076ba34aSJunchao Zhang   if (transA) {
7339566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
734076ba34aSJunchao Zhang     transA = false;
735a3f881fbSStefano Zampini   }
736a3f881fbSStefano Zampini 
737076ba34aSJunchao Zhang   if (transB) {
7389566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
739076ba34aSJunchao Zhang     transB = false;
740076ba34aSJunchao Zhang   }
7419566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
7429566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
743*f98996d3SJunchao Zhang   PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
7449566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
7459566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(C));
746a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
747a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
7489566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
7499566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
7509566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
751a3f881fbSStefano Zampini   c->reallocs         = 0;
752076ba34aSJunchao Zhang   C->info.mallocs     = 0;
753a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
754a3f881fbSStefano Zampini   C->assembled        = C->was_assembled = PETSC_TRUE;
755a3f881fbSStefano Zampini   C->num_ass++;
756a3f881fbSStefano Zampini   PetscFunctionReturn(0);
757a3f881fbSStefano Zampini }
758a3f881fbSStefano Zampini 
759a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
760a3f881fbSStefano Zampini {
761076ba34aSJunchao Zhang   Mat_Product                    *product = C->product;
762076ba34aSJunchao Zhang   MatProductType                 ptype;
763076ba34aSJunchao Zhang   Mat                            A,B;
764076ba34aSJunchao Zhang   bool                           transA,transB;
765076ba34aSJunchao Zhang   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
766076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
767076ba34aSJunchao Zhang   MPI_Comm                       comm;
768076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
769a3f881fbSStefano Zampini 
770a3f881fbSStefano Zampini   PetscFunctionBegin;
771a3f881fbSStefano Zampini   MatCheckProduct(C,1);
7729566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
7735f80ce2aSJacob Faibussowitsch   PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
774a3f881fbSStefano Zampini   A       = product->A;
775a3f881fbSStefano Zampini   B       = product->B;
7769566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
7779566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B));
778a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
779a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
780076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
781076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
782076ba34aSJunchao Zhang 
783a3f881fbSStefano Zampini   ptype   = product->type;
784a3f881fbSStefano Zampini   switch (ptype) {
785076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
786076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
787076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
788a3f881fbSStefano Zampini     default:
78998921bdaSJacob Faibussowitsch       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
790a3f881fbSStefano Zampini   }
791a3f881fbSStefano Zampini 
792076ba34aSJunchao Zhang   product->data = pdata = new MatProductData_SeqAIJKokkos();
793076ba34aSJunchao Zhang   pdata->kh.set_team_work_size(16);
794076ba34aSJunchao Zhang   pdata->kh.set_dynamic_scheduling(true);
795076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
796a3f881fbSStefano Zampini 
797076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
798076ba34aSJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
7996ffb9508SJunchao Zhang   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
8006ffb9508SJunchao Zhang      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
8016ffb9508SJunchao Zhang    */
802076ba34aSJunchao Zhang   pdata->kh.create_spgemm_handle(spgemm_alg);
803076ba34aSJunchao Zhang 
8049566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
805076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
806076ba34aSJunchao Zhang   if (transA) {
8079566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
808076ba34aSJunchao Zhang     transA = false;
809076ba34aSJunchao Zhang   }
810076ba34aSJunchao Zhang 
811076ba34aSJunchao Zhang   if (transB) {
8129566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
813076ba34aSJunchao Zhang     transB = false;
814076ba34aSJunchao Zhang   }
815076ba34aSJunchao Zhang 
8169566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
817076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
818076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
819076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
820076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
821076ba34aSJunchao Zhang   */
8229566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
823*f98996d3SJunchao Zhang   PetscCallCXX(sort_crs_matrix(csrmatC));
8249566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
825076ba34aSJunchao Zhang 
8269566063dSJacob Faibussowitsch   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
8279566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok));
828076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
829a3f881fbSStefano Zampini   PetscFunctionReturn(0);
830a3f881fbSStefano Zampini }
831a3f881fbSStefano Zampini 
832a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
833076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
834a3f881fbSStefano Zampini {
835076ba34aSJunchao Zhang   Mat_Product    *product = mat->product;
836a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
837a3f881fbSStefano Zampini 
838a3f881fbSStefano Zampini   PetscFunctionBegin;
839a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
8409566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok));
841a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
8429566063dSJacob Faibussowitsch     PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok));
843a3f881fbSStefano Zampini   }
844a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
845a3f881fbSStefano Zampini     switch (product->type) {
846a3f881fbSStefano Zampini     case MATPRODUCT_AB:
847a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
848a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
849a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
850a3f881fbSStefano Zampini       break;
851a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
852a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
853a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
854a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
855a3f881fbSStefano Zampini       break;
856a3f881fbSStefano Zampini     default:
857a3f881fbSStefano Zampini       break;
858a3f881fbSStefano Zampini     }
859a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
8609566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
861a3f881fbSStefano Zampini   }
862a3f881fbSStefano Zampini   PetscFunctionReturn(0);
863a3f881fbSStefano Zampini }
864a587d139SMark 
865f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
866f0cf5187SStefano Zampini {
867f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
868f0cf5187SStefano Zampini 
869f0cf5187SStefano Zampini   PetscFunctionBegin;
8709566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
8719566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
872f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
873076ba34aSJunchao Zhang   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
8749566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
8759566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
8769566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
877f0cf5187SStefano Zampini   PetscFunctionReturn(0);
878f0cf5187SStefano Zampini }
879f0cf5187SStefano Zampini 
880a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
881a587d139SMark {
882076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
883a587d139SMark 
884a587d139SMark   PetscFunctionBegin;
885076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
8862328674fSJunchao Zhang   if (aijkok) { /* Only zero the device if data is already there */
887076ba34aSJunchao Zhang     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
8889566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(A));
8892328674fSJunchao Zhang   } else { /* Might be preallocated but not assembled */
8909566063dSJacob Faibussowitsch     PetscCall(MatZeroEntries_SeqAIJ(A));
8912328674fSJunchao Zhang   }
892a587d139SMark   PetscFunctionReturn(0);
893a587d139SMark }
894a587d139SMark 
895f78ce678SMark Adams static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A,Vec x)
896f78ce678SMark Adams {
897f78ce678SMark Adams   Mat_SeqAIJ                   *aijseq;
898f78ce678SMark Adams   Mat_SeqAIJKokkos             *aijkok;
899f78ce678SMark Adams   PetscInt                     n;
900f78ce678SMark Adams   PetscScalarKokkosView        xv;
901f78ce678SMark Adams 
902f78ce678SMark Adams   PetscFunctionBegin;
903f78ce678SMark Adams   PetscCall(VecGetLocalSize(x,&n));
904f78ce678SMark Adams   PetscCheck(n == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
905f78ce678SMark Adams   PetscCheck(A->factortype == MAT_FACTOR_NONE,PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
906f78ce678SMark Adams 
907f78ce678SMark Adams   PetscCall(MatSeqAIJKokkosSyncDevice(A));
908f78ce678SMark Adams   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
909f78ce678SMark Adams 
910f78ce678SMark Adams   if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */
911f78ce678SMark Adams     PetscCall(MatMarkDiagonal_SeqAIJ(A));
912f78ce678SMark Adams     aijseq = static_cast<Mat_SeqAIJ*>(A->data);
913f78ce678SMark Adams     aijkok->SetDiagonal(aijseq->diag);
914f78ce678SMark Adams   }
915f78ce678SMark Adams 
916f78ce678SMark Adams   const auto& Aa = aijkok->a_dual.view_device();
917f78ce678SMark Adams   const auto& Ai = aijkok->i_dual.view_device();
918f78ce678SMark Adams   const auto& Adiag = aijkok->diag_dual.view_device();
919f78ce678SMark Adams 
920f78ce678SMark Adams   PetscCall(VecGetKokkosViewWrite(x,&xv));
921f78ce678SMark Adams   Kokkos::parallel_for(n,KOKKOS_LAMBDA(const PetscInt i) {
922f78ce678SMark Adams     if (Adiag(i) < Ai(i+1)) xv(i) = Aa(Adiag(i));
923f78ce678SMark Adams     else xv(i) = 0;
924f78ce678SMark Adams   });
925f78ce678SMark Adams   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
926f78ce678SMark Adams   PetscFunctionReturn(0);
927f78ce678SMark Adams }
928f78ce678SMark Adams 
929db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
93042550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
931db78de30SJunchao Zhang {
932db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
933db78de30SJunchao Zhang 
934db78de30SJunchao Zhang   PetscFunctionBegin;
935db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
936db78de30SJunchao Zhang   PetscValidPointer(kv,2);
937db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9389566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
939db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
940076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
941db78de30SJunchao Zhang   PetscFunctionReturn(0);
942db78de30SJunchao Zhang }
943db78de30SJunchao Zhang 
94442550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
945db78de30SJunchao Zhang {
946db78de30SJunchao Zhang   PetscFunctionBegin;
947db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
948db78de30SJunchao Zhang   PetscValidPointer(kv,2);
949db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
950db78de30SJunchao Zhang   PetscFunctionReturn(0);
951db78de30SJunchao Zhang }
952db78de30SJunchao Zhang 
95342550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
954db78de30SJunchao Zhang {
955db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
956db78de30SJunchao Zhang 
957db78de30SJunchao Zhang   PetscFunctionBegin;
958db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
959db78de30SJunchao Zhang   PetscValidPointer(kv,2);
960db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9619566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
962db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
963076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
964db78de30SJunchao Zhang   PetscFunctionReturn(0);
965db78de30SJunchao Zhang }
966db78de30SJunchao Zhang 
96742550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
968db78de30SJunchao Zhang {
969db78de30SJunchao Zhang   PetscFunctionBegin;
970db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
971db78de30SJunchao Zhang   PetscValidPointer(kv,2);
972db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9739566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
974db78de30SJunchao Zhang   PetscFunctionReturn(0);
975db78de30SJunchao Zhang }
976db78de30SJunchao Zhang 
97742550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
978db78de30SJunchao Zhang {
979db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
980db78de30SJunchao Zhang 
981db78de30SJunchao Zhang   PetscFunctionBegin;
982db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
983db78de30SJunchao Zhang   PetscValidPointer(kv,2);
984db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
985db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
986076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
987db78de30SJunchao Zhang   PetscFunctionReturn(0);
988db78de30SJunchao Zhang }
989db78de30SJunchao Zhang 
99042550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
991db78de30SJunchao Zhang {
992db78de30SJunchao Zhang   PetscFunctionBegin;
993db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
994db78de30SJunchao Zhang   PetscValidPointer(kv,2);
995db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9969566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosModifyDevice(A));
997db78de30SJunchao Zhang   PetscFunctionReturn(0);
998db78de30SJunchao Zhang }
999db78de30SJunchao Zhang 
1000c17cf699SJunchao Zhang /* Computes Y += alpha X */
1001c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
1002a587d139SMark {
1003a587d139SMark   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
1004c17cf699SJunchao Zhang   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
1005c17cf699SJunchao Zhang   ConstMatScalarKokkosView   Xa;
1006c17cf699SJunchao Zhang   MatScalarKokkosView        Ya;
1007a587d139SMark 
1008a587d139SMark   PetscFunctionBegin;
1009c17cf699SJunchao Zhang   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
1010c17cf699SJunchao Zhang   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
10119566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
10129566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(X));
10139566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1014db78de30SJunchao Zhang 
1015c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1016c17cf699SJunchao Zhang     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
1017a587d139SMark     PetscBool e;
10189566063dSJacob Faibussowitsch     PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e));
1019a587d139SMark     if (e) {
10209566063dSJacob Faibussowitsch       PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e));
1021c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
1022a587d139SMark     }
1023a587d139SMark   }
1024db78de30SJunchao Zhang 
1025c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1026c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1027c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1028c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1029c17cf699SJunchao Zhang   */
1030c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
1031c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
1032c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
1033c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
1034c17cf699SJunchao Zhang 
1035c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
1036c17cf699SJunchao Zhang     KokkosBlas::axpy(alpha,Xa,Ya);
10379566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1038c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1039c17cf699SJunchao Zhang     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
1040c17cf699SJunchao Zhang     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
1041c17cf699SJunchao Zhang 
1042c17cf699SJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
1043c17cf699SJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
1044c17cf699SJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
1045c17cf699SJunchao Zhang         PetscInt p,q = Yi(i);
1046c17cf699SJunchao Zhang         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1047c17cf699SJunchao Zhang           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1048c17cf699SJunchao Zhang           if (Xj(p) == Yj(q)) { /* Found it */
1049c17cf699SJunchao Zhang             Ya(q) += alpha * Xa(p);
1050c17cf699SJunchao Zhang             q++;
1051a587d139SMark           } else {
1052c17cf699SJunchao Zhang             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1053c17cf699SJunchao Zhang                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1054c17cf699SJunchao Zhang             */
10558b8b16f9SJunchao Zhang             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) =
10568b8b16f9SJunchao Zhang            #if PETSC_PKG_KOKKOS_VERSION_GE(3,6,99)
10578b8b16f9SJunchao Zhang               Kokkos::nan("1"); /* auto promote the double NaN if needed */
10588b8b16f9SJunchao Zhang            #else
10598b8b16f9SJunchao Zhang               Kokkos::Experimental::nan("1");
10608b8b16f9SJunchao Zhang            #endif
1061a587d139SMark           }
1062c17cf699SJunchao Zhang         }
1063c17cf699SJunchao Zhang       });
1064c17cf699SJunchao Zhang     });
10659566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1066c17cf699SJunchao Zhang   } else { /* different nonzero patterns */
1067c17cf699SJunchao Zhang     Mat             Z;
1068c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
1069c17cf699SJunchao Zhang     KernelHandle    kh;
1070c17cf699SJunchao Zhang     kh.create_spadd_handle(false);
1071c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1072c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1073c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
10749566063dSJacob Faibussowitsch     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z));
10759566063dSJacob Faibussowitsch     PetscCall(MatHeaderReplace(Y,&Z));
1076c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1077c17cf699SJunchao Zhang   }
10789566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
10799566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */
1080a587d139SMark   PetscFunctionReturn(0);
1081a587d139SMark }
1082a587d139SMark 
1083e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
108442550becSJunchao Zhang {
108542550becSJunchao Zhang   Mat_SeqAIJKokkos *akok;
108642550becSJunchao Zhang   Mat_SeqAIJ       *aseq;
108742550becSJunchao Zhang 
108842550becSJunchao Zhang   PetscFunctionBegin;
10899566063dSJacob Faibussowitsch   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
1090394ed5ebSJunchao Zhang   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
109142550becSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1092cbc6b225SStefano Zampini   delete akok;
1093cbc6b225SStefano Zampini   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE);
10949566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1095394ed5ebSJunchao Zhang   akok->SetUpCOO(aseq);
109642550becSJunchao Zhang   PetscFunctionReturn(0);
109742550becSJunchao Zhang }
109842550becSJunchao Zhang 
109942550becSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
110042550becSJunchao Zhang {
110142550becSJunchao Zhang   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
110242550becSJunchao Zhang   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1103394ed5ebSJunchao Zhang   PetscCount                  Annz = aseq->nz;
1104394ed5ebSJunchao Zhang   const PetscCountKokkosView& jmap = akok->jmap_d;
1105394ed5ebSJunchao Zhang   const PetscCountKokkosView& perm = akok->perm_d;
110642550becSJunchao Zhang   MatScalarKokkosView         Aa;
110742550becSJunchao Zhang   ConstMatScalarKokkosView    kv;
110842550becSJunchao Zhang   PetscMemType                memtype;
110942550becSJunchao Zhang 
111042550becSJunchao Zhang   PetscFunctionBegin;
11119566063dSJacob Faibussowitsch   PetscCall(PetscGetMemType(v,&memtype));
111242550becSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1113394ed5ebSJunchao Zhang     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
111442550becSJunchao Zhang   } else {
1115394ed5ebSJunchao Zhang     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
111642550becSJunchao Zhang   }
111742550becSJunchao Zhang 
1118c7b718f4SJunchao Zhang   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */
1119c7b718f4SJunchao Zhang   else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */
112042550becSJunchao Zhang 
1121c7b718f4SJunchao Zhang   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {
1122c7b718f4SJunchao Zhang     PetscScalar sum = 0.0;
1123c7b718f4SJunchao Zhang     for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k));
1124c7b718f4SJunchao Zhang     Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum;
1125c7b718f4SJunchao Zhang   });
1126394ed5ebSJunchao Zhang 
11279566063dSJacob Faibussowitsch   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
11289566063dSJacob Faibussowitsch   else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa));
112942550becSJunchao Zhang   PetscFunctionReturn(0);
113042550becSJunchao Zhang }
113142550becSJunchao Zhang 
11325fbaff96SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag)
11335fbaff96SJunchao Zhang {
11345fbaff96SJunchao Zhang   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
11355fbaff96SJunchao Zhang   MatScalarKokkosView         Aa;
11365fbaff96SJunchao Zhang   const MatRowMapKokkosView&  Ai = akok->i_dual.view_device();
11375fbaff96SJunchao Zhang   PetscInt                    m = A->rmap->n;
11385fbaff96SJunchao Zhang   ConstMatRowMapKokkosView    Adiag(diag,m); /* diag is a device pointer */
11395fbaff96SJunchao Zhang 
11405fbaff96SJunchao Zhang   PetscFunctionBegin;
11415fbaff96SJunchao Zhang   PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa));
11425fbaff96SJunchao Zhang   Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) {
11435fbaff96SJunchao Zhang     PetscScalar tmp;
11445fbaff96SJunchao Zhang     if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */
11455fbaff96SJunchao Zhang       tmp          = Aa(Ai(i));
11465fbaff96SJunchao Zhang       Aa(Ai(i))    = Aa(Adiag(i));
11475fbaff96SJunchao Zhang       Aa(Adiag(i)) = tmp;
11485fbaff96SJunchao Zhang     }
11495fbaff96SJunchao Zhang   });
11505fbaff96SJunchao Zhang   PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
11515fbaff96SJunchao Zhang   PetscFunctionReturn(0);
11525fbaff96SJunchao Zhang }
11535fbaff96SJunchao Zhang 
115486a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
11558f7e8f9dSMark Adams {
11568f7e8f9dSMark Adams   PetscFunctionBegin;
11579566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncHost(A));
11589566063dSJacob Faibussowitsch   PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
11598f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
11608f7e8f9dSMark Adams   PetscFunctionReturn(0);
11618f7e8f9dSMark Adams }
11628f7e8f9dSMark Adams 
11638c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
11648c3ff71bSJunchao Zhang {
1165076ba34aSJunchao Zhang   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1166076ba34aSJunchao Zhang 
11678c3ff71bSJunchao Zhang   PetscFunctionBegin;
1168076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
11696f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
11706f3d89d0SStefano Zampini 
11718c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
11728c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
11738c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1174a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1175f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1176a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1177076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
11788c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
11798c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
11808c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
11818c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
11828c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
11838c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1184076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
11850ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1186152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1187f78ce678SMark Adams   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1188076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1189076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1190076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1191076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1192076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1193076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
11947ee59b9bSJunchao Zhang   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
119542550becSJunchao Zhang 
11969566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos));
11979566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos));
1198076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1199076ba34aSJunchao Zhang }
1200076ba34aSJunchao Zhang 
1201076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1202076ba34aSJunchao Zhang {
1203076ba34aSJunchao Zhang   Mat_SeqAIJ *aseq;
1204076ba34aSJunchao Zhang   PetscInt    i,m,n;
1205076ba34aSJunchao Zhang 
1206076ba34aSJunchao Zhang   PetscFunctionBegin;
12075f80ce2aSJacob Faibussowitsch   PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1208076ba34aSJunchao Zhang 
1209076ba34aSJunchao Zhang   m = akok->nrows();
1210076ba34aSJunchao Zhang   n = akok->ncols();
12119566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(A,m,n,m,n));
12129566063dSJacob Faibussowitsch   PetscCall(MatSetType(A,MATSEQAIJKOKKOS));
1213076ba34aSJunchao Zhang 
1214076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
12159566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL));
1216076ba34aSJunchao Zhang   aseq = (Mat_SeqAIJ*)(A)->data;
1217076ba34aSJunchao Zhang 
1218076ba34aSJunchao Zhang   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1219076ba34aSJunchao Zhang   akok->j_dual.sync_host();
1220076ba34aSJunchao Zhang 
1221076ba34aSJunchao Zhang   aseq->i            = akok->i_host_data();
1222076ba34aSJunchao Zhang   aseq->j            = akok->j_host_data();
1223076ba34aSJunchao Zhang   aseq->a            = akok->a_host_data();
1224076ba34aSJunchao Zhang   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1225076ba34aSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE;
1226076ba34aSJunchao Zhang   aseq->free_a       = PETSC_FALSE;
1227076ba34aSJunchao Zhang   aseq->free_ij      = PETSC_FALSE;
1228076ba34aSJunchao Zhang   aseq->nz           = akok->nnz();
1229076ba34aSJunchao Zhang   aseq->maxnz        = aseq->nz;
1230076ba34aSJunchao Zhang 
12319566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m,&aseq->imax));
12329566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m,&aseq->ilen));
1233076ba34aSJunchao Zhang   for (i=0; i<m; i++) {
1234076ba34aSJunchao Zhang     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1235076ba34aSJunchao Zhang   }
1236076ba34aSJunchao Zhang 
1237076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1238076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1239ff751488SJunchao Zhang   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
12409566063dSJacob Faibussowitsch   PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
12419566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
1242076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1243076ba34aSJunchao Zhang }
1244076ba34aSJunchao Zhang 
1245076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1246076ba34aSJunchao Zhang 
1247076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1248076ba34aSJunchao Zhang  */
1249076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1250076ba34aSJunchao Zhang {
1251076ba34aSJunchao Zhang   PetscFunctionBegin;
12529566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm,A));
12539566063dSJacob Faibussowitsch   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok));
12548c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
12558c3ff71bSJunchao Zhang }
12568c3ff71bSJunchao Zhang 
12578c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
1258152b3e56SJunchao Zhang /*@C
12598c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
12608c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
12618c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
12628c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
12638c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
12648c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
12658c3ff71bSJunchao Zhang 
12668c3ff71bSJunchao Zhang    Collective
12678c3ff71bSJunchao Zhang 
12688c3ff71bSJunchao Zhang    Input Parameters:
12698c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
12708c3ff71bSJunchao Zhang .  m - number of rows
12718c3ff71bSJunchao Zhang .  n - number of columns
12728c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
12738c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
12748c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
12758c3ff71bSJunchao Zhang 
12768c3ff71bSJunchao Zhang    Output Parameter:
12778c3ff71bSJunchao Zhang .  A - the matrix
12788c3ff71bSJunchao Zhang 
12798c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
12808c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
12818c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
12828c3ff71bSJunchao Zhang 
12838c3ff71bSJunchao Zhang    Notes:
12848c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
12858c3ff71bSJunchao Zhang 
12868c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
12878c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
12888c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
12898c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
12908c3ff71bSJunchao Zhang 
12918c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
12928c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
12938c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
12948c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
12958c3ff71bSJunchao Zhang 
12968c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
12978c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
12988c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
12998c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
13008c3ff71bSJunchao Zhang 
13018c3ff71bSJunchao Zhang    Level: intermediate
13028c3ff71bSJunchao Zhang 
1303db781477SPatrick Sanan .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`
13048c3ff71bSJunchao Zhang @*/
13058c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
13068c3ff71bSJunchao Zhang {
13078c3ff71bSJunchao Zhang   PetscFunctionBegin;
13089566063dSJacob Faibussowitsch   PetscCall(PetscKokkosInitializeCheck());
13099566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm,A));
13109566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A,m,n,m,n));
13119566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A,MATSEQAIJKOKKOS));
13129566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
13138c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
13148c3ff71bSJunchao Zhang }
1315930e68a5SMark Adams 
13168f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
13178f7e8f9dSMark Adams //
131846804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used).
13198f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
13208f7e8f9dSMark Adams //
13218f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1322930e68a5SMark Adams {
13238f7e8f9dSMark Adams   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
13248f7e8f9dSMark Adams   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
13258f7e8f9dSMark Adams   IS                 isrow = b->row,isicol = b->icol;
13268f7e8f9dSMark Adams   const PetscInt     *r_h,*ic_h;
1327300d22a6SJunchao Zhang   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1328076ba34aSJunchao Zhang   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1329076ba34aSJunchao Zhang   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
13308f7e8f9dSMark Adams   PetscBool          row_identity,col_identity;
133146804e07SMark Adams   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1332930e68a5SMark Adams 
1333930e68a5SMark Adams   PetscFunctionBegin;
13342c71b3e2SJacob Faibussowitsch   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1335b94d7dedSBarry Smith   PetscCall(MatIsStructurallySymmetric(A,&row_identity));
13362c71b3e2SJacob Faibussowitsch   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
13379566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&r_h));
13389566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isicol,&ic_h));
13399566063dSJacob Faibussowitsch   PetscCall(ISGetSize(isicol,&nc));
13409566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
13419566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
13428f7e8f9dSMark Adams   {
13438f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
13448f7e8f9dSMark Adams     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
13458f7e8f9dSMark Adams     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
13468f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
13478f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
13488f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
13498f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
13508f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
13518f7e8f9dSMark Adams     size_t flops_h = 0.0;
13528f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
13538f7e8f9dSMark Adams     Kokkos::View<size_t> d_flops_k ("flops");
13548f7e8f9dSMark Adams     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
13558f7e8f9dSMark Adams     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
13568f7e8f9dSMark Adams     Kokkos::deep_copy (d_flops_k, h_flops_k);
13578f7e8f9dSMark Adams     Kokkos::deep_copy (d_r_k, h_r_k);
13588f7e8f9dSMark Adams     Kokkos::deep_copy (d_ic_k, h_ic_k);
13598f7e8f9dSMark Adams     // Fill A --> fact
13608f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1361042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
13628f7e8f9dSMark Adams         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
13638f7e8f9dSMark Adams         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
13648f7e8f9dSMark Adams         // zero rows of B
13658f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
13668f7e8f9dSMark Adams             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
13678f7e8f9dSMark Adams             PetscScalar *baL = ba_d + bi_d[rowb];
13688f7e8f9dSMark Adams             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
13698f7e8f9dSMark Adams             /* zero (unfactored row) */
13708f7e8f9dSMark Adams             for (int j=0;j<nzbL;j++) baL[j] = 0;
13718f7e8f9dSMark Adams             for (int j=0;j<nzbU;j++) baU[j] = 0;
13728f7e8f9dSMark Adams           });
13738f7e8f9dSMark Adams         // copy A into B
13748f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
13758f7e8f9dSMark Adams             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
13768f7e8f9dSMark Adams             const PetscScalar *av    = aa_d + ai_d[rowa];
13778f7e8f9dSMark Adams             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
13788f7e8f9dSMark Adams             /* load in initial (unfactored row) */
13798f7e8f9dSMark Adams             for (int j=0;j<nza;j++) {
13808f7e8f9dSMark Adams               PetscInt    colb = ic[ajtmp[j]];
13818f7e8f9dSMark Adams               PetscScalar vala = av[j];
13828f7e8f9dSMark Adams               if (colb == rowb) {
13838f7e8f9dSMark Adams                 *(ba_d + bdiag_d[rowb]) = vala;
13848f7e8f9dSMark Adams               } else {
13858f7e8f9dSMark Adams                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
13868f7e8f9dSMark Adams                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
13878f7e8f9dSMark Adams                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
13888f7e8f9dSMark Adams                 for (int j=0; j<nz ; j++) {
13898f7e8f9dSMark Adams                   if (pbj[j] == colb) {
13908f7e8f9dSMark Adams                     pba[j] = vala;
13918f7e8f9dSMark Adams                     set++;
13928f7e8f9dSMark Adams                     break;
13938f7e8f9dSMark Adams                   }
13948f7e8f9dSMark Adams                 }
13958f1da0b2SJunchao Zhang                #if !defined(PETSC_HAVE_SYCL)
13968f7e8f9dSMark Adams                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
13978f1da0b2SJunchao Zhang                #endif
13988f7e8f9dSMark Adams               }
13998f7e8f9dSMark Adams             }
14008f7e8f9dSMark Adams           });
14018f7e8f9dSMark Adams       });
14028f7e8f9dSMark Adams     Kokkos::fence();
1403930e68a5SMark Adams 
14048f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
14058f7e8f9dSMark Adams         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
14068f7e8f9dSMark Adams         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
14078f7e8f9dSMark Adams         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1408042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
14098f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc;
14108f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
14118f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
14128f7e8f9dSMark Adams         for (int ii=start; ii<end-1; ii++) {
14138f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
14148f7e8f9dSMark Adams           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
14158f7e8f9dSMark Adams           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
14168f7e8f9dSMark Adams           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
14178f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
14188f7e8f9dSMark Adams               PetscInt kIdx = j*Ni + field_block_idx;
14198f7e8f9dSMark Adams               if (kIdx >= nzUi) /* void */ ;
14208f7e8f9dSMark Adams               else {
14218f7e8f9dSMark Adams                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
14228f7e8f9dSMark Adams                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
14238f7e8f9dSMark Adams                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
14248f7e8f9dSMark Adams                 size_t         st_idx;
14258f7e8f9dSMark Adams                 // find and do L(k,i) = A(:k,i) / A(i,i)
14268f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
14278f7e8f9dSMark Adams                 // get column, there has got to be a better way
14288f7e8f9dSMark Adams                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
14298f7e8f9dSMark Adams                     if (pjL[j] == ii) {
14308f7e8f9dSMark Adams                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
14318f7e8f9dSMark Adams                       idx = j; // output
14328f7e8f9dSMark Adams                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
14338f7e8f9dSMark Adams                     }
14348f7e8f9dSMark Adams                 }, st_idx);
14358f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
14368f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
143799551766SMark Adams                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
143899551766SMark Adams #endif
143999551766SMark Adams                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
14408f7e8f9dSMark Adams                 // U(i+1,:end)
14418f7e8f9dSMark Adams                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
14428f7e8f9dSMark Adams                       PetscScalar Uij = baUi[uiIdx];
14438f7e8f9dSMark Adams                       PetscInt    col = bjUi[uiIdx];
14448f7e8f9dSMark Adams                       if (col==myk) {
14458f7e8f9dSMark Adams                         // A_kk = A_kk - L_ki * U_ij(k)
14468f7e8f9dSMark Adams                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
14478f7e8f9dSMark Adams                         *Akkv = *Akkv - L_ki() * Uij; // UiK
14488f7e8f9dSMark Adams                       } else {
14498f7e8f9dSMark Adams                         PetscScalar    *start, *end, *pAkjv=NULL;
14508f7e8f9dSMark Adams                         PetscInt       high, low;
14518f7e8f9dSMark Adams                         const PetscInt *startj;
14528f7e8f9dSMark Adams                         if (col<myk) { // L
14538f7e8f9dSMark Adams                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
14548f7e8f9dSMark Adams                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
14558f7e8f9dSMark Adams                           start = pLki+1; // start at pLki+1, A22(myk,1)
14568f7e8f9dSMark Adams                           startj= bj_d + bi_d[myk] + idx;
14578f7e8f9dSMark Adams                           end   = ba_d + bi_d[myk+1];
14588f7e8f9dSMark Adams                         } else {
14598f7e8f9dSMark Adams                           PetscInt idx = bdiag_d[myk+1]+1;
14608f7e8f9dSMark Adams                           start = ba_d + idx;
14618f7e8f9dSMark Adams                           startj= bj_d + idx;
14628f7e8f9dSMark Adams                           end   = ba_d + bdiag_d[myk];
14638f7e8f9dSMark Adams                         }
14648f7e8f9dSMark Adams                         // search for 'col', use bisection search - TODO
14658f7e8f9dSMark Adams                         low  = 0;
14668f7e8f9dSMark Adams                         high = (PetscInt)(end-start);
14678f7e8f9dSMark Adams                         while (high-low > 5) {
14688f7e8f9dSMark Adams                           int t = (low+high)/2;
14698f7e8f9dSMark Adams                           if (startj[t] > col) high = t;
14708f7e8f9dSMark Adams                           else                 low  = t;
14718f7e8f9dSMark Adams                         }
14728f7e8f9dSMark Adams                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
14738f7e8f9dSMark Adams                           if (startj[pAkjv-start] == col) break;
14748f7e8f9dSMark Adams                         }
14758f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
147699551766SMark Adams                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
147799551766SMark Adams #endif
14788f7e8f9dSMark Adams                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
14798f7e8f9dSMark Adams                       }
14808f7e8f9dSMark Adams                     });
14818f7e8f9dSMark Adams               }
14828f7e8f9dSMark Adams             });
14838f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
14848f7e8f9dSMark Adams           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
14858f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
14868f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
14878f7e8f9dSMark Adams       });
14888f7e8f9dSMark Adams     Kokkos::fence();
14898f7e8f9dSMark Adams     Kokkos::deep_copy (h_flops_k, d_flops_k);
14909566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
14918f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
14928f7e8f9dSMark Adams         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
14938f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
14948f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
14958f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
14968f7e8f9dSMark Adams             int i = start + outer_index*Ni + lg_rank%Ni;
14978f7e8f9dSMark Adams             if (i < end) {
14988f7e8f9dSMark Adams               PetscScalar *pv = ba_d + bdiag_d[i];
14998f7e8f9dSMark Adams               *pv = 1.0/(*pv);
15008f7e8f9dSMark Adams             }
15018f7e8f9dSMark Adams           });
15028f7e8f9dSMark Adams       });
15038f7e8f9dSMark Adams   }
15049566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
15059566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isicol,&ic_h));
15069566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&r_h));
15078f7e8f9dSMark Adams 
15089566063dSJacob Faibussowitsch   PetscCall(ISIdentity(isrow,&row_identity));
15099566063dSJacob Faibussowitsch   PetscCall(ISIdentity(isicol,&col_identity));
15108f7e8f9dSMark Adams   if (b->inode.size) {
15118f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
15128f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
15138f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
15148f7e8f9dSMark Adams   } else {
15158f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
15168f7e8f9dSMark Adams   }
15178f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
15189566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU
15198f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
15208f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
15218f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
15228f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
15238f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
15248f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
15258f7e8f9dSMark Adams 
1526930e68a5SMark Adams   PetscFunctionReturn(0);
1527930e68a5SMark Adams }
1528930e68a5SMark Adams 
152986a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1530930e68a5SMark Adams {
1531930e68a5SMark Adams   PetscFunctionBegin;
15329566063dSJacob Faibussowitsch   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
153386a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
153486a27549SJunchao Zhang   PetscFunctionReturn(0);
153586a27549SJunchao Zhang }
153686a27549SJunchao Zhang 
153786a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
153886a27549SJunchao Zhang {
153986a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
154086a27549SJunchao Zhang 
154186a27549SJunchao Zhang   PetscFunctionBegin;
154286a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
154386a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
154486a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
154586a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
154686a27549SJunchao Zhang   }
154786a27549SJunchao Zhang   PetscFunctionReturn(0);
154886a27549SJunchao Zhang }
154986a27549SJunchao Zhang 
155086a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
155186a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
155286a27549SJunchao Zhang {
155386a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1554076ba34aSJunchao Zhang   MatColIdxType             n = A->rmap->n;
155586a27549SJunchao Zhang 
155686a27549SJunchao Zhang   PetscFunctionBegin;
155786a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
155886a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
1559076ba34aSJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
156086a27549SJunchao Zhang     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1561076ba34aSJunchao Zhang     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1562076ba34aSJunchao Zhang     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
156386a27549SJunchao Zhang 
1564*f98996d3SJunchao Zhang     transpose_matrix<
1565076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1566076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1567076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
156886a27549SJunchao Zhang         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
156986a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
157086a27549SJunchao Zhang 
157186a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
157286a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
157386a27549SJunchao Zhang     */
1574*f98996d3SJunchao Zhang     sort_crs_matrix<DefaultExecutionSpace,
1575076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
157686a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
157786a27549SJunchao Zhang 
157886a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
157986a27549SJunchao Zhang 
158086a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
1581076ba34aSJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
158286a27549SJunchao Zhang     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1583076ba34aSJunchao Zhang     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1584076ba34aSJunchao Zhang     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
158586a27549SJunchao Zhang 
1586*f98996d3SJunchao Zhang     transpose_matrix<
1587076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1588076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1589076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
159086a27549SJunchao Zhang         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
159186a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
159286a27549SJunchao Zhang 
159386a27549SJunchao Zhang     /* Sort indices. See comments above */
1594*f98996d3SJunchao Zhang     sort_crs_matrix<DefaultExecutionSpace,
1595076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
159686a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
159786a27549SJunchao Zhang 
159886a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
159986a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
160086a27549SJunchao Zhang   }
160186a27549SJunchao Zhang   PetscFunctionReturn(0);
160286a27549SJunchao Zhang }
160386a27549SJunchao Zhang 
160486a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
160586a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
160686a27549SJunchao Zhang {
160786a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
160886a27549SJunchao Zhang   PetscScalarKokkosView          xv;
160986a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
161086a27549SJunchao Zhang 
161186a27549SJunchao Zhang   PetscFunctionBegin;
16129566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
16139566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A));
16149566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(b,&bv));
16159566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(x,&xv));
161686a27549SJunchao Zhang   /* Solve L tmpv = b */
16179566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
161886a27549SJunchao Zhang   /* Solve Ux = tmpv */
16199566063dSJacob Faibussowitsch   PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
16209566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(b,&bv));
16219566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
16229566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
162386a27549SJunchao Zhang   PetscFunctionReturn(0);
162486a27549SJunchao Zhang }
162586a27549SJunchao Zhang 
1626076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
162786a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
162886a27549SJunchao Zhang {
162986a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
163086a27549SJunchao Zhang   PetscScalarKokkosView          xv;
163186a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
163286a27549SJunchao Zhang 
163386a27549SJunchao Zhang   PetscFunctionBegin;
16349566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
16359566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A));
16369566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosView(b,&bv));
16379566063dSJacob Faibussowitsch   PetscCall(VecGetKokkosViewWrite(x,&xv));
163886a27549SJunchao Zhang   /* Solve U^T tmpv = b */
163986a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
164086a27549SJunchao Zhang 
164186a27549SJunchao Zhang   /* Solve L^T x = tmpv */
164286a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
16439566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosView(b,&bv));
16449566063dSJacob Faibussowitsch   PetscCall(VecRestoreKokkosViewWrite(x,&xv));
16459566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
164686a27549SJunchao Zhang   PetscFunctionReturn(0);
164786a27549SJunchao Zhang }
164886a27549SJunchao Zhang 
164986a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
165086a27549SJunchao Zhang {
165186a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
165286a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
165386a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
165486a27549SJunchao Zhang 
165586a27549SJunchao Zhang   PetscFunctionBegin;
16569566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
16579566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1658076ba34aSJunchao Zhang 
1659076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1660076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1661076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1662076ba34aSJunchao Zhang 
1663076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
166486a27549SJunchao Zhang 
166586a27549SJunchao Zhang   B->assembled                       = PETSC_TRUE;
166686a27549SJunchao Zhang   B->preallocated                    = PETSC_TRUE;
166786a27549SJunchao Zhang   B->ops->solve                      = MatSolve_SeqAIJKokkos;
166886a27549SJunchao Zhang   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
166986a27549SJunchao Zhang   B->ops->matsolve                   = NULL;
167086a27549SJunchao Zhang   B->ops->matsolvetranspose          = NULL;
167186a27549SJunchao Zhang   B->offloadmask                     = PETSC_OFFLOAD_GPU;
167286a27549SJunchao Zhang 
167386a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
167486a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
167586a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1676eeadb341SJunchao Zhang   /* TODO: log flops, but how to know that? */
16779566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
167886a27549SJunchao Zhang   PetscFunctionReturn(0);
167986a27549SJunchao Zhang }
168086a27549SJunchao Zhang 
168186a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
168286a27549SJunchao Zhang {
168386a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok;
168486a27549SJunchao Zhang   Mat_SeqAIJ                     *b;
168586a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
168686a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
168786a27549SJunchao Zhang   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
168886a27549SJunchao Zhang   PetscInt                       n = A->rmap->n;
168986a27549SJunchao Zhang 
169086a27549SJunchao Zhang   PetscFunctionBegin;
16919566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A));
169286a27549SJunchao Zhang   /* Rebuild factors */
169386a27549SJunchao Zhang   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
169486a27549SJunchao Zhang   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
169586a27549SJunchao Zhang 
169686a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
169786a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
169886a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
169986a27549SJunchao Zhang 
170086a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
170186a27549SJunchao Zhang 
170286a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
170386a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
170486a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d,n+1);
170586a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
170686a27549SJunchao Zhang 
170786a27549SJunchao Zhang   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1708076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1709076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1710076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
171186a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
171286a27549SJunchao Zhang 
171386a27549SJunchao Zhang   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
171486a27549SJunchao Zhang   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
171586a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
171686a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
171786a27549SJunchao Zhang 
171886a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
171986a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
172086a27549SJunchao Zhang  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
172186a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
172286a27549SJunchao Zhang  #else
172386a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
172486a27549SJunchao Zhang  #endif
172586a27549SJunchao Zhang 
172686a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
172786a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
172886a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
172986a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
173086a27549SJunchao Zhang 
173186a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
17329566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL));
173386a27549SJunchao Zhang   b     = (Mat_SeqAIJ*)B->data;
173486a27549SJunchao Zhang   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
173586a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
173686a27549SJunchao Zhang   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
173786a27549SJunchao Zhang 
173886a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
173986a27549SJunchao Zhang   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1740930e68a5SMark Adams   PetscFunctionReturn(0);
1741930e68a5SMark Adams }
1742930e68a5SMark Adams 
17438f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
17448f7e8f9dSMark Adams {
17458f7e8f9dSMark Adams   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
17468f7e8f9dSMark Adams   const PetscInt   nrows   = A->rmap->n;
1747930e68a5SMark Adams 
17488f7e8f9dSMark Adams   PetscFunctionBegin;
17499566063dSJacob Faibussowitsch   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
17508f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
17518f7e8f9dSMark Adams   // move B data into Kokkos
17529566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
17539566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
17548f7e8f9dSMark Adams   {
17558f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1756300d22a6SJunchao Zhang     if (!baijkok->diag_d.extent(0)) {
17578f7e8f9dSMark Adams       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1758300d22a6SJunchao Zhang       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1759300d22a6SJunchao Zhang       Kokkos::deep_copy (baijkok->diag_d, h_diag);
17608f7e8f9dSMark Adams     }
17618f7e8f9dSMark Adams   }
17628f7e8f9dSMark Adams   PetscFunctionReturn(0);
17638f7e8f9dSMark Adams }
17648f7e8f9dSMark Adams 
176586a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1766930e68a5SMark Adams {
1767930e68a5SMark Adams   PetscFunctionBegin;
1768930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
1769930e68a5SMark Adams   PetscFunctionReturn(0);
1770930e68a5SMark Adams }
1771930e68a5SMark Adams 
17728f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
17738f7e8f9dSMark Adams {
17748f7e8f9dSMark Adams   PetscFunctionBegin;
17758f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
17768f7e8f9dSMark Adams   PetscFunctionReturn(0);
17778f7e8f9dSMark Adams }
17788f7e8f9dSMark Adams 
1779930e68a5SMark Adams /*MC
178086a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
178186a27549SJunchao Zhang   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1782930e68a5SMark Adams 
1783930e68a5SMark Adams   Level: beginner
1784930e68a5SMark Adams 
1785db781477SPatrick Sanan .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
1786930e68a5SMark Adams M*/
178786a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1788930e68a5SMark Adams {
1789930e68a5SMark Adams   PetscInt       n = A->rmap->n;
1790930e68a5SMark Adams 
1791930e68a5SMark Adams   PetscFunctionBegin;
17929566063dSJacob Faibussowitsch   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
17939566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*B,n,n,n,n));
1794930e68a5SMark Adams   (*B)->factortype = ftype;
17959566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
17969566063dSJacob Faibussowitsch   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
1797930e68a5SMark Adams 
17988f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
17999566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
180086a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_TRUE;
180186a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
180286a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
18039566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
180486a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
180586a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
180698921bdaSJacob Faibussowitsch   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1807930e68a5SMark Adams 
18089566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
18099566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos));
1810930e68a5SMark Adams   PetscFunctionReturn(0);
1811930e68a5SMark Adams }
18128f7e8f9dSMark Adams 
18138f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
18148f7e8f9dSMark Adams {
18158f7e8f9dSMark Adams   PetscInt       n = A->rmap->n;
18168f7e8f9dSMark Adams 
18178f7e8f9dSMark Adams   PetscFunctionBegin;
18189566063dSJacob Faibussowitsch   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
18199566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*B,n,n,n,n));
18208f7e8f9dSMark Adams   (*B)->factortype = ftype;
1821f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
18229566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
18239566063dSJacob Faibussowitsch   PetscCall(MatSetType(*B,MATSEQAIJKOKKOS));
18248f7e8f9dSMark Adams 
18258f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
18269566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
18278f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
18288f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
18298f7e8f9dSMark Adams 
18309566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
18319566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device));
18328f7e8f9dSMark Adams   PetscFunctionReturn(0);
18338f7e8f9dSMark Adams }
183486a27549SJunchao Zhang 
183586a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
183686a27549SJunchao Zhang {
183786a27549SJunchao Zhang   PetscFunctionBegin;
18389566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos));
18399566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos));
18409566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device));
184186a27549SJunchao Zhang   PetscFunctionReturn(0);
184286a27549SJunchao Zhang }
184386a27549SJunchao Zhang 
1844076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
1845076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1846076ba34aSJunchao Zhang {
1847076ba34aSJunchao Zhang   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1848076ba34aSJunchao Zhang   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1849076ba34aSJunchao Zhang   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1850076ba34aSJunchao Zhang   const PetscInt    *i = iv.data();
1851076ba34aSJunchao Zhang   const PetscInt    *j = jv.data();
1852076ba34aSJunchao Zhang   const PetscScalar *a = av.data();
1853076ba34aSJunchao Zhang   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1854076ba34aSJunchao Zhang 
1855076ba34aSJunchao Zhang   PetscFunctionBegin;
18569566063dSJacob Faibussowitsch   PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz));
1857076ba34aSJunchao Zhang   for (PetscInt k=0; k<m; k++) {
18589566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k));
1859076ba34aSJunchao Zhang     for (PetscInt p=i[k]; p<i[k+1]; p++) {
186063a3b9bcSJacob Faibussowitsch       PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],(double)PetscRealPart(a[p])));
1861076ba34aSJunchao Zhang     }
18629566063dSJacob Faibussowitsch     PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n"));
1863076ba34aSJunchao Zhang   }
1864076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1865076ba34aSJunchao Zhang }
1866