xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 3f3ba80a9d957b25882ae9e1deed299d162b7bce)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2076ba34aSJunchao Zhang #include <petscpkg_version.h>
3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
442550becSJunchao Zhang #include <petsc/private/sfimpl.h>
58c3ff71bSJunchao Zhang #include <petscsystypes.h>
68c3ff71bSJunchao Zhang #include <petscerror.h>
78c3ff71bSJunchao Zhang 
88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
9f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
10076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp>
118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
128c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
15076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
16076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
1786a27549SJunchao Zhang 
1842550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
198c3ff71bSJunchao Zhang 
208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
218c3ff71bSJunchao Zhang 
22076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
25076ba34aSJunchao Zhang  */
268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
278c3ff71bSJunchao Zhang {
288c3ff71bSJunchao Zhang   PetscErrorCode    ierr;
29076ba34aSJunchao Zhang   Mat_SeqAIJ        *aijseq;
30076ba34aSJunchao Zhang   Mat_SeqAIJKokkos  *aijkok;
318c3ff71bSJunchao Zhang 
328c3ff71bSJunchao Zhang   PetscFunctionBegin;
33076ba34aSJunchao Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
348c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
35076ba34aSJunchao Zhang 
36076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38076ba34aSJunchao Zhang 
39076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
40076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
41076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
42076ba34aSJunchao Zhang   */
43076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
44076ba34aSJunchao Zhang     delete aijkok;
45076ba34aSJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
46076ba34aSJunchao Zhang     A->spptr = aijkok;
47076ba34aSJunchao Zhang   }
48076ba34aSJunchao Zhang 
49a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
50a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
51a587d139SMark   }
528c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
538c3ff71bSJunchao Zhang }
548c3ff71bSJunchao Zhang 
5586a27549SJunchao Zhang /* Sync CSR data to device if not yet */
56076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
578c3ff71bSJunchao Zhang {
588c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
598c3ff71bSJunchao Zhang 
608c3ff71bSJunchao Zhang   PetscFunctionBegin;
612c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
622c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
632c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
64076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
65076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
66580c7c76SPierre Jolivet     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
6786a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
688c3ff71bSJunchao Zhang   }
698c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
708c3ff71bSJunchao Zhang }
718c3ff71bSJunchao Zhang 
72076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
73fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
7486a27549SJunchao Zhang {
7586a27549SJunchao Zhang   PetscErrorCode   ierr;
7686a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
7786a27549SJunchao Zhang 
7886a27549SJunchao Zhang   PetscFunctionBegin;
792c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
8086a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
8186a27549SJunchao Zhang   aijkok->a_dual.modify_device();
8286a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
8386a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
842328674fSJunchao Zhang   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
8586a27549SJunchao Zhang   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
8686a27549SJunchao Zhang   PetscFunctionReturn(0);
8786a27549SJunchao Zhang }
8886a27549SJunchao Zhang 
89f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
90f0cf5187SStefano Zampini {
91f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
92f0cf5187SStefano Zampini 
93f0cf5187SStefano Zampini   PetscFunctionBegin;
94f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9586a27549SJunchao Zhang    /* We do not expect one needs factors on host  */
962c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
972c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
98076ba34aSJunchao Zhang   aijkok->a_dual.sync_host();
99f0cf5187SStefano Zampini   PetscFunctionReturn(0);
100f0cf5187SStefano Zampini }
101f0cf5187SStefano Zampini 
102f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
103f0cf5187SStefano Zampini {
104076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
105f0cf5187SStefano Zampini 
106f0cf5187SStefano Zampini   PetscFunctionBegin;
107076ba34aSJunchao Zhang   if (aijkok) {
108076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
109076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
110076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
111076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
112076ba34aSJunchao Zhang   }
113076ba34aSJunchao Zhang   PetscFunctionReturn(0);
114076ba34aSJunchao Zhang }
115076ba34aSJunchao Zhang 
116076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
117076ba34aSJunchao Zhang {
118076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
119076ba34aSJunchao Zhang 
120076ba34aSJunchao Zhang   PetscFunctionBegin;
121076ba34aSJunchao Zhang   if (aijkok) aijkok->a_dual.modify_host();
122076ba34aSJunchao Zhang   PetscFunctionReturn(0);
123076ba34aSJunchao Zhang }
124076ba34aSJunchao Zhang 
125076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
126076ba34aSJunchao Zhang {
127076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
128076ba34aSJunchao Zhang 
129076ba34aSJunchao Zhang   PetscFunctionBegin;
1302328674fSJunchao Zhang   if (aijkok) {
131076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
132076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1332328674fSJunchao Zhang   } else {
1342328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1352328674fSJunchao Zhang   }
136076ba34aSJunchao Zhang   PetscFunctionReturn(0);
137076ba34aSJunchao Zhang }
138076ba34aSJunchao Zhang 
139076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
140076ba34aSJunchao Zhang {
141076ba34aSJunchao Zhang   PetscFunctionBegin;
142076ba34aSJunchao Zhang   *array = NULL;
143076ba34aSJunchao Zhang   PetscFunctionReturn(0);
144076ba34aSJunchao Zhang }
145076ba34aSJunchao Zhang 
146076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
147076ba34aSJunchao Zhang {
148076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
149076ba34aSJunchao Zhang 
150076ba34aSJunchao Zhang   PetscFunctionBegin;
1512328674fSJunchao Zhang   if (aijkok) {
152076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1532328674fSJunchao Zhang   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
1542328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1552328674fSJunchao Zhang   }
156076ba34aSJunchao Zhang   PetscFunctionReturn(0);
157076ba34aSJunchao Zhang }
158076ba34aSJunchao Zhang 
159076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
160076ba34aSJunchao Zhang {
161076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
162076ba34aSJunchao Zhang 
163076ba34aSJunchao Zhang   PetscFunctionBegin;
1642328674fSJunchao Zhang   if (aijkok) {
165076ba34aSJunchao Zhang     aijkok->a_dual.clear_sync_state();
166076ba34aSJunchao Zhang     aijkok->a_dual.modify_host();
1672328674fSJunchao Zhang   }
168f0cf5187SStefano Zampini   PetscFunctionReturn(0);
169f0cf5187SStefano Zampini }
170f0cf5187SStefano Zampini 
171a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
172042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
173a587d139SMark {
174042217e8SBarry Smith   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
175042217e8SBarry Smith   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
176a587d139SMark 
177a587d139SMark   PetscFunctionBegin;
1782c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
179152b3e56SJunchao Zhang   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
180a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
181a587d139SMark   PetscFunctionReturn(0);
182a587d139SMark }
183a587d139SMark 
184a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
185042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
186a587d139SMark {
187042217e8SBarry Smith   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
188a587d139SMark 
189a587d139SMark   PetscFunctionBegin;
190a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
191a587d139SMark     *d_mat = aijkok->device_mat_d.data();
192a587d139SMark   } else {
193a587d139SMark     PetscErrorCode   ierr;
194a587d139SMark     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
195a587d139SMark     *d_mat  = NULL;
196a587d139SMark   }
197a587d139SMark   PetscFunctionReturn(0);
198a587d139SMark }
199076ba34aSJunchao Zhang 
200076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */
201076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
202152b3e56SJunchao Zhang {
203152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
204152b3e56SJunchao Zhang 
205152b3e56SJunchao Zhang   PetscFunctionBegin;
2062c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207076ba34aSJunchao Zhang   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
208076ba34aSJunchao Zhang     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
209076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
210076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
211076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
21286a27549SJunchao Zhang     aijkok->transpose_updated = PETSC_TRUE;
213076ba34aSJunchao Zhang   }
214076ba34aSJunchao Zhang   *csrmatT = &aijkok->csrmatT;
215152b3e56SJunchao Zhang   PetscFunctionReturn(0);
216152b3e56SJunchao Zhang }
217152b3e56SJunchao Zhang 
218076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */
219076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
220152b3e56SJunchao Zhang {
221eeadb341SJunchao Zhang   PetscErrorCode                   ierr;
222152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
223152b3e56SJunchao Zhang 
224152b3e56SJunchao Zhang   PetscFunctionBegin;
225eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2262c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
227076ba34aSJunchao Zhang   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
228076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
229076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
230076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
231076ba34aSJunchao Zhang    #if defined(PETSC_USE_COMPLEX)
232076ba34aSJunchao Zhang     const auto& a = aijkok->csrmatH.values;
233076ba34aSJunchao Zhang     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
234076ba34aSJunchao Zhang    #endif
23586a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_TRUE;
236076ba34aSJunchao Zhang   }
237076ba34aSJunchao Zhang   *csrmatH = &aijkok->csrmatH;
238eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
239152b3e56SJunchao Zhang   PetscFunctionReturn(0);
240152b3e56SJunchao Zhang }
241a587d139SMark 
2428c3ff71bSJunchao Zhang /* y = A x */
2438c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2448c3ff71bSJunchao Zhang {
2458c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2468c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
247152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
248152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2498c3ff71bSJunchao Zhang 
2508c3ff71bSJunchao Zhang   PetscFunctionBegin;
2516af1d01cSJed Brown   ierr   = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2528c3ff71bSJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
253152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
254ad7ac7b2SJunchao Zhang   ierr   = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
2558c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
256152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
257152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
258ad7ac7b2SJunchao Zhang   ierr   = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
259076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
260152b3e56SJunchao Zhang   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2616af1d01cSJed Brown   ierr   = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2628c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2638c3ff71bSJunchao Zhang }
2648c3ff71bSJunchao Zhang 
2658c3ff71bSJunchao Zhang /* y = A^T x */
2668c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2678c3ff71bSJunchao Zhang {
2688c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2698c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
270152b3e56SJunchao Zhang   const char                       *mode;
271152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
272152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
273076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
2748c3ff71bSJunchao Zhang 
2758c3ff71bSJunchao Zhang   PetscFunctionBegin;
276eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2778c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
278152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
279ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
280152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
281076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
282152b3e56SJunchao Zhang     mode = "N";
283152b3e56SJunchao Zhang   } else {
284076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
285076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
286152b3e56SJunchao Zhang     mode = "T";
287152b3e56SJunchao Zhang   }
288076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
289152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
290ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
291076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
292eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2938c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2948c3ff71bSJunchao Zhang }
2958c3ff71bSJunchao Zhang 
2968c3ff71bSJunchao Zhang /* y = A^H x */
2978c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2988c3ff71bSJunchao Zhang {
2998c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3008c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
301152b3e56SJunchao Zhang   const char                       *mode;
302152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
303152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
304076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3058c3ff71bSJunchao Zhang 
3068c3ff71bSJunchao Zhang   PetscFunctionBegin;
307eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3088c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
309152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
310ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
311152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
312076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
313152b3e56SJunchao Zhang     mode = "N";
314152b3e56SJunchao Zhang   } else {
315076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
316076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
317152b3e56SJunchao Zhang     mode = "C";
318152b3e56SJunchao Zhang   }
319076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
320152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
321ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
322076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
323eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3248c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3258c3ff71bSJunchao Zhang }
3268c3ff71bSJunchao Zhang 
3278c3ff71bSJunchao Zhang /* z = A x + y */
3288c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
3298c3ff71bSJunchao Zhang {
3308c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3318c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
332152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
333152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
3348c3ff71bSJunchao Zhang 
3358c3ff71bSJunchao Zhang   PetscFunctionBegin;
336eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3378c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
338152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
339152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
340ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
3418c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
3428c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
343152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
344152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
345152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
346ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
347152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
348eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3498c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3508c3ff71bSJunchao Zhang }
3518c3ff71bSJunchao Zhang 
3528c3ff71bSJunchao Zhang /* z = A^T x + y */
3538c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3548c3ff71bSJunchao Zhang {
3558c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3568c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
357152b3e56SJunchao Zhang   const char                       *mode;
358152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
359152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
360076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3618c3ff71bSJunchao Zhang 
3628c3ff71bSJunchao Zhang   PetscFunctionBegin;
363eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3648c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
365152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
366152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
367ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
3688c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
369152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
370076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
371152b3e56SJunchao Zhang     mode = "N";
372152b3e56SJunchao Zhang   } else {
373076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
374076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
375152b3e56SJunchao Zhang     mode = "T";
376152b3e56SJunchao Zhang   }
377076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
378152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
379152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
380ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
381076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
382eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3838c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3848c3ff71bSJunchao Zhang }
3858c3ff71bSJunchao Zhang 
3868c3ff71bSJunchao Zhang /* z = A^H x + y */
3878c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3888c3ff71bSJunchao Zhang {
3898c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3908c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
391152b3e56SJunchao Zhang   const char                       *mode;
392152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
393152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
394076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3958c3ff71bSJunchao Zhang 
3968c3ff71bSJunchao Zhang   PetscFunctionBegin;
397eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3988c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
399152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
400152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
401ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
4028c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
403152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
404076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
405152b3e56SJunchao Zhang     mode = "N";
406152b3e56SJunchao Zhang   } else {
407076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
408076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
409152b3e56SJunchao Zhang     mode = "C";
410152b3e56SJunchao Zhang   }
411076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
412152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
413152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
414ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
415076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
416eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
417152b3e56SJunchao Zhang   PetscFunctionReturn(0);
418152b3e56SJunchao Zhang }
419152b3e56SJunchao Zhang 
420152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
421152b3e56SJunchao Zhang {
422152b3e56SJunchao Zhang   PetscErrorCode            ierr;
423152b3e56SJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
424152b3e56SJunchao Zhang 
425152b3e56SJunchao Zhang   PetscFunctionBegin;
426152b3e56SJunchao Zhang   switch (op) {
427152b3e56SJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
428152b3e56SJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
429152b3e56SJunchao Zhang       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
430152b3e56SJunchao Zhang       A->form_explicit_transpose = flg;
431152b3e56SJunchao Zhang       break;
432152b3e56SJunchao Zhang     default:
433152b3e56SJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
434152b3e56SJunchao Zhang       break;
435152b3e56SJunchao Zhang   }
4368c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4378c3ff71bSJunchao Zhang }
4388c3ff71bSJunchao Zhang 
439076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
4403d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
4418c3ff71bSJunchao Zhang {
4428c3ff71bSJunchao Zhang   PetscErrorCode   ierr;
443076ba34aSJunchao Zhang   Mat_SeqAIJ       *aseq;
4448c3ff71bSJunchao Zhang 
4458c3ff71bSJunchao Zhang   PetscFunctionBegin;
446a3f881fbSStefano Zampini   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
447076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
448076ba34aSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
4498c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
450076ba34aSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
451076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
4522c71b3e2SJacob Faibussowitsch     PetscCheckFalse(A != *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
453076ba34aSJunchao Zhang     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
454076ba34aSJunchao Zhang     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
455076ba34aSJunchao Zhang     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
456076ba34aSJunchao Zhang     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
457076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ*>(A->data);
458394ed5ebSJunchao Zhang     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
4592c71b3e2SJacob Faibussowitsch       PetscCheckFalse(A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
460076ba34aSJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
4618c3ff71bSJunchao Zhang     }
462076ba34aSJunchao Zhang   }
4638c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4648c3ff71bSJunchao Zhang }
4658c3ff71bSJunchao Zhang 
466076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
467076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
468076ba34aSJunchao Zhang  */
469076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
4708c3ff71bSJunchao Zhang {
4718c3ff71bSJunchao Zhang   PetscErrorCode        ierr;
472076ba34aSJunchao Zhang   Mat_SeqAIJ            *bseq;
473076ba34aSJunchao Zhang   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
474076ba34aSJunchao Zhang   Mat                   mat;
4758c3ff71bSJunchao Zhang 
4768c3ff71bSJunchao Zhang   PetscFunctionBegin;
477076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
478076ba34aSJunchao Zhang   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
479076ba34aSJunchao Zhang   mat  = *B;
480076ba34aSJunchao Zhang   if (A->assembled) {
481076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
482076ba34aSJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
483076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
484076ba34aSJunchao Zhang     /* Now copy values to B if needed */
485076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
486076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
487076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
488076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
489076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
490076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
491076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
492076ba34aSJunchao Zhang       }
493076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
494076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
495076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
496076ba34aSJunchao Zhang     }
497076ba34aSJunchao Zhang     mat->spptr = bkok;
498076ba34aSJunchao Zhang   }
499076ba34aSJunchao Zhang 
500076ba34aSJunchao Zhang   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
501076ba34aSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
502076ba34aSJunchao Zhang   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
503076ba34aSJunchao Zhang   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
5048c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5058c3ff71bSJunchao Zhang }
5068c3ff71bSJunchao Zhang 
5070ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
5080ecb592aSJunchao Zhang {
5090ecb592aSJunchao Zhang   PetscErrorCode    ierr;
5100ecb592aSJunchao Zhang   Mat               At;
511ff751488SJunchao Zhang   KokkosCsrMatrix   *internT;
5120ecb592aSJunchao Zhang   Mat_SeqAIJKokkos  *atkok,*bkok;
5130ecb592aSJunchao Zhang 
5140ecb592aSJunchao Zhang   PetscFunctionBegin;
5150ecb592aSJunchao Zhang   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
5160ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
517ff751488SJunchao Zhang     /* Deep copy internT, as we want to isolate the internal transpose */
518ff751488SJunchao Zhang     CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
5190ecb592aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
5200ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
5217a3b1c56SStefano Zampini     else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
5220ecb592aSJunchao Zhang   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
5230ecb592aSJunchao Zhang     if ((*B)->assembled) {
5240ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
5250ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
5260ecb592aSJunchao Zhang       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
5270ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
5280ecb592aSJunchao Zhang       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
5290ecb592aSJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
5300ecb592aSJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
5310ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
5320ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
5330ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
5340ecb592aSJunchao Zhang   }
5350ecb592aSJunchao Zhang   PetscFunctionReturn(0);
5360ecb592aSJunchao Zhang }
5370ecb592aSJunchao Zhang 
5388c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
5398c3ff71bSJunchao Zhang {
5408c3ff71bSJunchao Zhang   PetscErrorCode             ierr;
54186a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
5428c3ff71bSJunchao Zhang 
5438c3ff71bSJunchao Zhang   PetscFunctionBegin;
54486a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
54586a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
5468c3ff71bSJunchao Zhang     delete aijkok;
54786a27549SJunchao Zhang   } else {
54886a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
54986a27549SJunchao Zhang   }
550152b3e56SJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
55142550becSJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
55242550becSJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       NULL);CHKERRQ(ierr);
553152b3e56SJunchao Zhang   A->spptr = NULL;
5548c3ff71bSJunchao Zhang   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
5558c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5568c3ff71bSJunchao Zhang }
5578c3ff71bSJunchao Zhang 
558*3f3ba80aSJunchao Zhang /*MC
559*3f3ba80aSJunchao Zhang    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
560*3f3ba80aSJunchao Zhang 
561*3f3ba80aSJunchao Zhang    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
562*3f3ba80aSJunchao Zhang 
563*3f3ba80aSJunchao Zhang    Options Database Keys:
564*3f3ba80aSJunchao Zhang .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
565*3f3ba80aSJunchao Zhang 
566*3f3ba80aSJunchao Zhang   Level: beginner
567*3f3ba80aSJunchao Zhang 
568*3f3ba80aSJunchao Zhang .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS
569*3f3ba80aSJunchao Zhang M*/
57086a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
57186a27549SJunchao Zhang {
57286a27549SJunchao Zhang   PetscErrorCode ierr;
57386a27549SJunchao Zhang 
57486a27549SJunchao Zhang   PetscFunctionBegin;
57586a27549SJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
57686a27549SJunchao Zhang   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
57786a27549SJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
57886a27549SJunchao Zhang   PetscFunctionReturn(0);
57986a27549SJunchao Zhang }
58086a27549SJunchao Zhang 
581076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
582076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
583a3f881fbSStefano Zampini {
584076ba34aSJunchao Zhang   PetscErrorCode               ierr;
585076ba34aSJunchao Zhang   Mat_SeqAIJ                   *a,*b;
586076ba34aSJunchao Zhang   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
587076ba34aSJunchao Zhang   MatScalarKokkosView          aa,ba,ca;
588076ba34aSJunchao Zhang   MatRowMapKokkosView          ai,bi,ci;
589076ba34aSJunchao Zhang   MatColIdxKokkosView          aj,bj,cj;
590076ba34aSJunchao Zhang   PetscInt                     m,n,nnz,aN;
591a3f881fbSStefano Zampini 
592a3f881fbSStefano Zampini   PetscFunctionBegin;
593076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
594076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
595076ba34aSJunchao Zhang   PetscValidPointer(C,4);
596076ba34aSJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
597076ba34aSJunchao Zhang   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
5982c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
5992c71b3e2SJacob Faibussowitsch   PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
600076ba34aSJunchao Zhang 
601076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
602076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
603076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ*>(A->data);
604076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ*>(B->data);
605076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
606076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
607076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
608076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
609076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
610076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
611076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
612076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
613076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
614076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
615076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
616076ba34aSJunchao Zhang     aj = akok->j_dual.view_device();
617076ba34aSJunchao Zhang     bj = bkok->j_dual.view_device();
618076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
619076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
620076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
621076ba34aSJunchao Zhang     ca = ca_dual.view_device();
622076ba34aSJunchao Zhang     ci = ci_dual.view_device();
623076ba34aSJunchao Zhang     cj = cj_dual.view_device();
624076ba34aSJunchao Zhang 
625076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
626076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
627076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
628076ba34aSJunchao Zhang       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
629076ba34aSJunchao Zhang 
630076ba34aSJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
631076ba34aSJunchao Zhang         ci(i) = coffset;
632076ba34aSJunchao Zhang         if (i == m-1) ci(m) = ai(m) + bi(m);
633076ba34aSJunchao Zhang       });
634076ba34aSJunchao Zhang 
635076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
636076ba34aSJunchao Zhang         if (k < alen) {
637076ba34aSJunchao Zhang           ca(coffset+k) = aa(ai(i)+k);
638076ba34aSJunchao Zhang           cj(coffset+k) = aj(ai(i)+k);
639076ba34aSJunchao Zhang         } else {
640076ba34aSJunchao Zhang           ca(coffset+k) = ba(bi(i)+k-alen);
641076ba34aSJunchao Zhang           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
642076ba34aSJunchao Zhang         }
643076ba34aSJunchao Zhang       });
644076ba34aSJunchao Zhang     });
645076ba34aSJunchao Zhang     ca_dual.modify_device();
646076ba34aSJunchao Zhang     ci_dual.modify_device();
647076ba34aSJunchao Zhang     cj_dual.modify_device();
648076ba34aSJunchao Zhang     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
649076ba34aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
650076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
651076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
652076ba34aSJunchao Zhang     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
653076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
654076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
655076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
656076ba34aSJunchao Zhang 
657076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
658076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
659076ba34aSJunchao Zhang       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
660076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
661076ba34aSJunchao Zhang         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
662076ba34aSJunchao Zhang         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
663076ba34aSJunchao Zhang       });
664076ba34aSJunchao Zhang     });
665076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
666076ba34aSJunchao Zhang   }
667076ba34aSJunchao Zhang   PetscFunctionReturn(0);
668076ba34aSJunchao Zhang }
669076ba34aSJunchao Zhang 
670076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
671076ba34aSJunchao Zhang {
672076ba34aSJunchao Zhang   PetscFunctionBegin;
673076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
674a3f881fbSStefano Zampini   PetscFunctionReturn(0);
675a3f881fbSStefano Zampini }
676a3f881fbSStefano Zampini 
677a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
678a3f881fbSStefano Zampini {
679076ba34aSJunchao Zhang   PetscErrorCode                 ierr;
680a3f881fbSStefano Zampini   Mat_Product                    *product = C->product;
681a3f881fbSStefano Zampini   Mat                            A,B;
682076ba34aSJunchao Zhang   bool                           transA,transB; /* use bool, since KK needs this type */
683a3f881fbSStefano Zampini   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
684a3f881fbSStefano Zampini   Mat_SeqAIJ                     *c;
685076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
686076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB;
687a3f881fbSStefano Zampini 
688a3f881fbSStefano Zampini   PetscFunctionBegin;
689a3f881fbSStefano Zampini   MatCheckProduct(C,1);
6902c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
691076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
692076ba34aSJunchao Zhang 
693076ba34aSJunchao Zhang   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
694076ba34aSJunchao Zhang     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
695076ba34aSJunchao Zhang     PetscFunctionReturn(0);
696076ba34aSJunchao Zhang   }
697076ba34aSJunchao Zhang 
698076ba34aSJunchao Zhang   switch (product->type) {
699076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
700076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
701076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
702076ba34aSJunchao Zhang     default:
70398921bdaSJacob Faibussowitsch       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
704076ba34aSJunchao Zhang   }
705076ba34aSJunchao Zhang 
706a3f881fbSStefano Zampini   A     = product->A;
707a3f881fbSStefano Zampini   B     = product->B;
708a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
709a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
710a3f881fbSStefano Zampini   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
711a3f881fbSStefano Zampini   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
712a3f881fbSStefano Zampini   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
713076ba34aSJunchao Zhang 
7142c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
715076ba34aSJunchao Zhang 
716076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
717076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
718076ba34aSJunchao Zhang 
719076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
720076ba34aSJunchao Zhang   if (transA) {
721076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
722076ba34aSJunchao Zhang     transA = false;
723a3f881fbSStefano Zampini   }
724a3f881fbSStefano Zampini 
725076ba34aSJunchao Zhang   if (transB) {
726076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
727076ba34aSJunchao Zhang     transB = false;
728076ba34aSJunchao Zhang   }
729eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
730076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
731076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
732eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
733076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
734a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
735a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
7367d3de750SJacob Faibussowitsch   ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
737a3f881fbSStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
7387d3de750SJacob Faibussowitsch   ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
739a3f881fbSStefano Zampini   c->reallocs         = 0;
740076ba34aSJunchao Zhang   C->info.mallocs     = 0;
741a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
742a3f881fbSStefano Zampini   C->assembled        = C->was_assembled = PETSC_TRUE;
743a3f881fbSStefano Zampini   C->num_ass++;
744a3f881fbSStefano Zampini   PetscFunctionReturn(0);
745a3f881fbSStefano Zampini }
746a3f881fbSStefano Zampini 
747a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
748a3f881fbSStefano Zampini {
749a3f881fbSStefano Zampini   PetscErrorCode                 ierr;
750076ba34aSJunchao Zhang   Mat_Product                    *product = C->product;
751076ba34aSJunchao Zhang   MatProductType                 ptype;
752076ba34aSJunchao Zhang   Mat                            A,B;
753076ba34aSJunchao Zhang   bool                           transA,transB;
754076ba34aSJunchao Zhang   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
755076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
756076ba34aSJunchao Zhang   MPI_Comm                       comm;
757076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
758a3f881fbSStefano Zampini 
759a3f881fbSStefano Zampini   PetscFunctionBegin;
760a3f881fbSStefano Zampini   MatCheckProduct(C,1);
761076ba34aSJunchao Zhang   ierr = PetscObjectGetComm((PetscObject)C,&comm);
7622c71b3e2SJacob Faibussowitsch   PetscCheckFalse(product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
763a3f881fbSStefano Zampini   A       = product->A;
764a3f881fbSStefano Zampini   B       = product->B;
765a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
766a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
767a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
768a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
769076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
770076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
771076ba34aSJunchao Zhang 
772a3f881fbSStefano Zampini   ptype   = product->type;
773a3f881fbSStefano Zampini   switch (ptype) {
774076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
775076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
776076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
777a3f881fbSStefano Zampini     default:
77898921bdaSJacob Faibussowitsch       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
779a3f881fbSStefano Zampini   }
780a3f881fbSStefano Zampini 
781076ba34aSJunchao Zhang   product->data = pdata = new MatProductData_SeqAIJKokkos();
782076ba34aSJunchao Zhang   pdata->kh.set_team_work_size(16);
783076ba34aSJunchao Zhang   pdata->kh.set_dynamic_scheduling(true);
784076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
785a3f881fbSStefano Zampini 
786076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
787076ba34aSJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
7886ffb9508SJunchao Zhang   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
7896ffb9508SJunchao Zhang      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
7906ffb9508SJunchao Zhang    */
791076ba34aSJunchao Zhang   pdata->kh.create_spgemm_handle(spgemm_alg);
792076ba34aSJunchao Zhang 
793eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
794076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
795076ba34aSJunchao Zhang   if (transA) {
796076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
797076ba34aSJunchao Zhang     transA = false;
798076ba34aSJunchao Zhang   }
799076ba34aSJunchao Zhang 
800076ba34aSJunchao Zhang   if (transB) {
801076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
802076ba34aSJunchao Zhang     transB = false;
803076ba34aSJunchao Zhang   }
804076ba34aSJunchao Zhang 
805076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
806076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
807076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
808076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
809076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
810076ba34aSJunchao Zhang   */
811076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
812076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
813eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
814076ba34aSJunchao Zhang 
815076ba34aSJunchao Zhang   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
816076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
817076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
818a3f881fbSStefano Zampini   PetscFunctionReturn(0);
819a3f881fbSStefano Zampini }
820a3f881fbSStefano Zampini 
821a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
822076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
823a3f881fbSStefano Zampini {
824a3f881fbSStefano Zampini   PetscErrorCode ierr;
825076ba34aSJunchao Zhang   Mat_Product    *product = mat->product;
826a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
827a3f881fbSStefano Zampini 
828a3f881fbSStefano Zampini   PetscFunctionBegin;
829a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
830a3f881fbSStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
831a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
832a3f881fbSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
833a3f881fbSStefano Zampini   }
834a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
835a3f881fbSStefano Zampini     switch (product->type) {
836a3f881fbSStefano Zampini     case MATPRODUCT_AB:
837a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
838a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
839a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
840a3f881fbSStefano Zampini       break;
841a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
842a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
843a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
844a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
845a3f881fbSStefano Zampini       break;
846a3f881fbSStefano Zampini     default:
847a3f881fbSStefano Zampini       break;
848a3f881fbSStefano Zampini     }
849a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
850a3f881fbSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
851a3f881fbSStefano Zampini   }
852a3f881fbSStefano Zampini   PetscFunctionReturn(0);
853a3f881fbSStefano Zampini }
854a587d139SMark 
855f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
856f0cf5187SStefano Zampini {
857f0cf5187SStefano Zampini   PetscErrorCode   ierr;
858f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
859f0cf5187SStefano Zampini 
860f0cf5187SStefano Zampini   PetscFunctionBegin;
861eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
862f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
863f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
864076ba34aSJunchao Zhang   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
865076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
866076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
8676af1d01cSJed Brown   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
868f0cf5187SStefano Zampini   PetscFunctionReturn(0);
869f0cf5187SStefano Zampini }
870f0cf5187SStefano Zampini 
871a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
872a587d139SMark {
873a587d139SMark   PetscErrorCode   ierr;
874076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
875a587d139SMark 
876a587d139SMark   PetscFunctionBegin;
877076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
8782328674fSJunchao Zhang   if (aijkok) { /* Only zero the device if data is already there */
879076ba34aSJunchao Zhang     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
880076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
8812328674fSJunchao Zhang   } else { /* Might be preallocated but not assembled */
8822328674fSJunchao Zhang     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
8832328674fSJunchao Zhang   }
884a587d139SMark   PetscFunctionReturn(0);
885a587d139SMark }
886a587d139SMark 
887db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
88842550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
889db78de30SJunchao Zhang {
890db78de30SJunchao Zhang   PetscErrorCode     ierr;
891db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
892db78de30SJunchao Zhang 
893db78de30SJunchao Zhang   PetscFunctionBegin;
894db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
895db78de30SJunchao Zhang   PetscValidPointer(kv,2);
896db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
897db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
898db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
899076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
900db78de30SJunchao Zhang   PetscFunctionReturn(0);
901db78de30SJunchao Zhang }
902db78de30SJunchao Zhang 
90342550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
904db78de30SJunchao Zhang {
905db78de30SJunchao Zhang   PetscFunctionBegin;
906db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
907db78de30SJunchao Zhang   PetscValidPointer(kv,2);
908db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
909db78de30SJunchao Zhang   PetscFunctionReturn(0);
910db78de30SJunchao Zhang }
911db78de30SJunchao Zhang 
91242550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
913db78de30SJunchao Zhang {
914db78de30SJunchao Zhang   PetscErrorCode     ierr;
915db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
916db78de30SJunchao Zhang 
917db78de30SJunchao Zhang   PetscFunctionBegin;
918db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
919db78de30SJunchao Zhang   PetscValidPointer(kv,2);
920db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
921db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
922db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
923076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
924db78de30SJunchao Zhang   PetscFunctionReturn(0);
925db78de30SJunchao Zhang }
926db78de30SJunchao Zhang 
92742550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
928db78de30SJunchao Zhang {
929db78de30SJunchao Zhang   PetscErrorCode     ierr;
930db78de30SJunchao Zhang 
931db78de30SJunchao Zhang   PetscFunctionBegin;
932db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
933db78de30SJunchao Zhang   PetscValidPointer(kv,2);
934db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
935076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
936db78de30SJunchao Zhang   PetscFunctionReturn(0);
937db78de30SJunchao Zhang }
938db78de30SJunchao Zhang 
93942550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
940db78de30SJunchao Zhang {
941db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
942db78de30SJunchao Zhang 
943db78de30SJunchao Zhang   PetscFunctionBegin;
944db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
945db78de30SJunchao Zhang   PetscValidPointer(kv,2);
946db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
947db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
948076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
949db78de30SJunchao Zhang   PetscFunctionReturn(0);
950db78de30SJunchao Zhang }
951db78de30SJunchao Zhang 
95242550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
953db78de30SJunchao Zhang {
954db78de30SJunchao Zhang   PetscErrorCode     ierr;
955db78de30SJunchao Zhang 
956db78de30SJunchao Zhang   PetscFunctionBegin;
957db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
958db78de30SJunchao Zhang   PetscValidPointer(kv,2);
959db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
960076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
961db78de30SJunchao Zhang   PetscFunctionReturn(0);
962db78de30SJunchao Zhang }
963db78de30SJunchao Zhang 
964c17cf699SJunchao Zhang /* Computes Y += alpha X */
965c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
966a587d139SMark {
967a587d139SMark   PetscErrorCode             ierr;
968a587d139SMark   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
969c17cf699SJunchao Zhang   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
970c17cf699SJunchao Zhang   ConstMatScalarKokkosView   Xa;
971c17cf699SJunchao Zhang   MatScalarKokkosView        Ya;
972a587d139SMark 
973a587d139SMark   PetscFunctionBegin;
974c17cf699SJunchao Zhang   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
975c17cf699SJunchao Zhang   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
976c17cf699SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
977c17cf699SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
978eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
979db78de30SJunchao Zhang 
980c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
981c17cf699SJunchao Zhang     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
982a587d139SMark     PetscBool e;
983a587d139SMark     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
984a587d139SMark     if (e) {
985a587d139SMark       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
986c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
987a587d139SMark     }
988a587d139SMark   }
989db78de30SJunchao Zhang 
990c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
991c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
992c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
993c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
994c17cf699SJunchao Zhang   */
995c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
996c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
997c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
998c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
999c17cf699SJunchao Zhang 
1000c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
1001c17cf699SJunchao Zhang     KokkosBlas::axpy(alpha,Xa,Ya);
1002c17cf699SJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(Y);
1003c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1004c17cf699SJunchao Zhang     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
1005c17cf699SJunchao Zhang     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
1006c17cf699SJunchao Zhang 
1007c17cf699SJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
1008c17cf699SJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
1009c17cf699SJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
1010c17cf699SJunchao Zhang         PetscInt p,q = Yi(i);
1011c17cf699SJunchao Zhang         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1012c17cf699SJunchao Zhang           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1013c17cf699SJunchao Zhang           if (Xj(p) == Yj(q)) { /* Found it */
1014c17cf699SJunchao Zhang             Ya(q) += alpha * Xa(p);
1015c17cf699SJunchao Zhang             q++;
1016a587d139SMark           } else {
1017c17cf699SJunchao Zhang             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1018c17cf699SJunchao Zhang                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1019c17cf699SJunchao Zhang             */
1020c17cf699SJunchao Zhang             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1021a587d139SMark           }
1022c17cf699SJunchao Zhang         }
1023c17cf699SJunchao Zhang       });
1024c17cf699SJunchao Zhang     });
1025c17cf699SJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(Y);
1026c17cf699SJunchao Zhang   } else { /* different nonzero patterns */
1027c17cf699SJunchao Zhang     Mat             Z;
1028c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
1029c17cf699SJunchao Zhang     KernelHandle    kh;
1030c17cf699SJunchao Zhang     kh.create_spadd_handle(false);
1031c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1032c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1033c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
1034c17cf699SJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1035c17cf699SJunchao Zhang     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1036c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1037c17cf699SJunchao Zhang   }
1038eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1039eeadb341SJunchao Zhang   ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */
1040a587d139SMark   PetscFunctionReturn(0);
1041a587d139SMark }
1042a587d139SMark 
1043394ed5ebSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
104442550becSJunchao Zhang {
104542550becSJunchao Zhang   PetscErrorCode            ierr;
1046394ed5ebSJunchao Zhang   Mat                       newmat;
104742550becSJunchao Zhang   Mat_SeqAIJKokkos          *akok;
104842550becSJunchao Zhang   Mat_SeqAIJ                *aseq;
104942550becSJunchao Zhang 
105042550becSJunchao Zhang   PetscFunctionBegin;
1051394ed5ebSJunchao Zhang   ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr);
1052394ed5ebSJunchao Zhang   ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr);
1053394ed5ebSJunchao Zhang   ierr = MatSetType(newmat,MATSEQAIJ);CHKERRQ(ierr);
1054394ed5ebSJunchao Zhang   ierr = MatSetPreallocationCOO_SeqAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
105542550becSJunchao Zhang   ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
105642550becSJunchao Zhang   ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
105742550becSJunchao Zhang   ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
1058394ed5ebSJunchao Zhang   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
105942550becSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1060394ed5ebSJunchao Zhang   akok->SetUpCOO(aseq);
106142550becSJunchao Zhang   PetscFunctionReturn(0);
106242550becSJunchao Zhang }
106342550becSJunchao Zhang 
106442550becSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
106542550becSJunchao Zhang {
106642550becSJunchao Zhang   PetscErrorCode              ierr;
106742550becSJunchao Zhang   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
106842550becSJunchao Zhang   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1069394ed5ebSJunchao Zhang   PetscCount                  Annz = aseq->nz;
1070394ed5ebSJunchao Zhang   const PetscCountKokkosView& jmap = akok->jmap_d;
1071394ed5ebSJunchao Zhang   const PetscCountKokkosView& perm = akok->perm_d;
107242550becSJunchao Zhang   MatScalarKokkosView         Aa;
107342550becSJunchao Zhang   ConstMatScalarKokkosView    kv;
107442550becSJunchao Zhang   PetscMemType                memtype;
107542550becSJunchao Zhang 
107642550becSJunchao Zhang   PetscFunctionBegin;
1077394ed5ebSJunchao Zhang   PetscAssert(A->assembled,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Expected matrix to be already assembled in MatSetPreallocationCOO()");
107842550becSJunchao Zhang   ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
107942550becSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1080394ed5ebSJunchao Zhang     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
108142550becSJunchao Zhang   } else {
1082394ed5ebSJunchao Zhang     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
108342550becSJunchao Zhang   }
108442550becSJunchao Zhang 
1085394ed5ebSJunchao Zhang   if (imode == INSERT_VALUES) {ierr = MatSeqAIJGetKokkosViewWrite(A,&Aa);CHKERRQ(ierr);} /* write matrix values */
1086394ed5ebSJunchao Zhang   else {ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr);} /* read & write matrix values */
108742550becSJunchao Zhang 
1088394ed5ebSJunchao Zhang   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {
1089b6c38306SJunchao Zhang     PetscScalar sum = 0.0;
1090b6c38306SJunchao Zhang     for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k));
1091b6c38306SJunchao Zhang     Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum;
109242550becSJunchao Zhang   });
1093394ed5ebSJunchao Zhang 
1094394ed5ebSJunchao Zhang   if (imode == INSERT_VALUES) {ierr = MatSeqAIJRestoreKokkosViewWrite(A,&Aa);CHKERRQ(ierr);}
1095394ed5ebSJunchao Zhang   else {ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);}
109642550becSJunchao Zhang   PetscFunctionReturn(0);
109742550becSJunchao Zhang }
109842550becSJunchao Zhang 
109986a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
11008f7e8f9dSMark Adams {
11018f7e8f9dSMark Adams   PetscErrorCode ierr;
11028f7e8f9dSMark Adams 
11038f7e8f9dSMark Adams   PetscFunctionBegin;
11048f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
11058f7e8f9dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
11068f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
11078f7e8f9dSMark Adams   PetscFunctionReturn(0);
11088f7e8f9dSMark Adams }
11098f7e8f9dSMark Adams 
11108c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
11118c3ff71bSJunchao Zhang {
111242550becSJunchao Zhang   PetscErrorCode     ierr;
1113076ba34aSJunchao Zhang   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1114076ba34aSJunchao Zhang 
11158c3ff71bSJunchao Zhang   PetscFunctionBegin;
1116076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
11176f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
11186f3d89d0SStefano Zampini 
11198c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
11208c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
11218c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1122a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1123f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1124a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1125076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
11268c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
11278c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
11288c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
11298c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
11308c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
11318c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1132076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
11330ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1134152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1135076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1136076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1137076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1138076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1139076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1140076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
114142550becSJunchao Zhang 
114242550becSJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr);
114342550becSJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr);
1144076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1145076ba34aSJunchao Zhang }
1146076ba34aSJunchao Zhang 
1147076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1148076ba34aSJunchao Zhang {
1149076ba34aSJunchao Zhang   PetscErrorCode ierr;
1150076ba34aSJunchao Zhang   Mat_SeqAIJ     *aseq;
1151076ba34aSJunchao Zhang   PetscInt       i,m,n;
1152076ba34aSJunchao Zhang 
1153076ba34aSJunchao Zhang   PetscFunctionBegin;
11542c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1155076ba34aSJunchao Zhang 
1156076ba34aSJunchao Zhang   m    = akok->nrows();
1157076ba34aSJunchao Zhang   n    = akok->ncols();
1158076ba34aSJunchao Zhang   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1159076ba34aSJunchao Zhang   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1160076ba34aSJunchao Zhang 
1161076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
1162076ba34aSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1163076ba34aSJunchao Zhang   aseq = (Mat_SeqAIJ*)(A)->data;
1164076ba34aSJunchao Zhang 
1165076ba34aSJunchao Zhang   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1166076ba34aSJunchao Zhang   akok->j_dual.sync_host();
1167076ba34aSJunchao Zhang 
1168076ba34aSJunchao Zhang   aseq->i            = akok->i_host_data();
1169076ba34aSJunchao Zhang   aseq->j            = akok->j_host_data();
1170076ba34aSJunchao Zhang   aseq->a            = akok->a_host_data();
1171076ba34aSJunchao Zhang   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1172076ba34aSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE;
1173076ba34aSJunchao Zhang   aseq->free_a       = PETSC_FALSE;
1174076ba34aSJunchao Zhang   aseq->free_ij      = PETSC_FALSE;
1175076ba34aSJunchao Zhang   aseq->nz           = akok->nnz();
1176076ba34aSJunchao Zhang   aseq->maxnz        = aseq->nz;
1177076ba34aSJunchao Zhang 
1178076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1179076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1180076ba34aSJunchao Zhang   for (i=0; i<m; i++) {
1181076ba34aSJunchao Zhang     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1182076ba34aSJunchao Zhang   }
1183076ba34aSJunchao Zhang 
1184076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1185076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1186ff751488SJunchao Zhang   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1187076ba34aSJunchao Zhang   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1188076ba34aSJunchao Zhang   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1189076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1190076ba34aSJunchao Zhang }
1191076ba34aSJunchao Zhang 
1192076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1193076ba34aSJunchao Zhang 
1194076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1195076ba34aSJunchao Zhang  */
1196076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1197076ba34aSJunchao Zhang {
1198076ba34aSJunchao Zhang   PetscErrorCode ierr;
1199076ba34aSJunchao Zhang 
1200076ba34aSJunchao Zhang   PetscFunctionBegin;
1201076ba34aSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1202076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
12038c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
12048c3ff71bSJunchao Zhang }
12058c3ff71bSJunchao Zhang 
12068c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
1207152b3e56SJunchao Zhang /*@C
12088c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
12098c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
12108c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
12118c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
12128c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
12138c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
12148c3ff71bSJunchao Zhang 
12158c3ff71bSJunchao Zhang    Collective
12168c3ff71bSJunchao Zhang 
12178c3ff71bSJunchao Zhang    Input Parameters:
12188c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
12198c3ff71bSJunchao Zhang .  m - number of rows
12208c3ff71bSJunchao Zhang .  n - number of columns
12218c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
12228c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
12238c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
12248c3ff71bSJunchao Zhang 
12258c3ff71bSJunchao Zhang    Output Parameter:
12268c3ff71bSJunchao Zhang .  A - the matrix
12278c3ff71bSJunchao Zhang 
12288c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
12298c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
12308c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
12318c3ff71bSJunchao Zhang 
12328c3ff71bSJunchao Zhang    Notes:
12338c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
12348c3ff71bSJunchao Zhang 
12358c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
12368c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
12378c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
12388c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
12398c3ff71bSJunchao Zhang 
12408c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
12418c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
12428c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
12438c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
12448c3ff71bSJunchao Zhang 
12458c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
12468c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
12478c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
12488c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
12498c3ff71bSJunchao Zhang 
12508c3ff71bSJunchao Zhang    Level: intermediate
12518c3ff71bSJunchao Zhang 
1252076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
12538c3ff71bSJunchao Zhang @*/
12548c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
12558c3ff71bSJunchao Zhang {
12568c3ff71bSJunchao Zhang   PetscErrorCode ierr;
12578c3ff71bSJunchao Zhang 
12588c3ff71bSJunchao Zhang   PetscFunctionBegin;
12598c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
12608c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
12618c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
12628c3ff71bSJunchao Zhang   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
12638c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
12648c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
12658c3ff71bSJunchao Zhang }
1266930e68a5SMark Adams 
12678f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
12688f7e8f9dSMark Adams //
126946804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used).
12708f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
12718f7e8f9dSMark Adams //
12728f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1273930e68a5SMark Adams {
12748f7e8f9dSMark Adams   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
12758f7e8f9dSMark Adams   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
12768f7e8f9dSMark Adams   IS                 isrow = b->row,isicol = b->icol;
1277930e68a5SMark Adams   PetscErrorCode     ierr;
12788f7e8f9dSMark Adams   const PetscInt     *r_h,*ic_h;
1279300d22a6SJunchao Zhang   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1280076ba34aSJunchao Zhang   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1281076ba34aSJunchao Zhang   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
12828f7e8f9dSMark Adams   PetscBool          row_identity,col_identity;
128346804e07SMark Adams   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1284930e68a5SMark Adams 
1285930e68a5SMark Adams   PetscFunctionBegin;
12862c71b3e2SJacob Faibussowitsch   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
12878f7e8f9dSMark Adams   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
12882c71b3e2SJacob Faibussowitsch   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
12898f7e8f9dSMark Adams   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
12908f7e8f9dSMark Adams   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
12918f7e8f9dSMark Adams   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
12928f7e8f9dSMark Adams   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
12938f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
12948f7e8f9dSMark Adams   {
12958f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
12968f7e8f9dSMark Adams     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
12978f7e8f9dSMark Adams     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
12988f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
12998f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
13008f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
13018f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
13028f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
13038f7e8f9dSMark Adams     size_t flops_h = 0.0;
13048f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
13058f7e8f9dSMark Adams     Kokkos::View<size_t> d_flops_k ("flops");
13068f7e8f9dSMark Adams     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
13078f7e8f9dSMark Adams     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
13088f7e8f9dSMark Adams     Kokkos::deep_copy (d_flops_k, h_flops_k);
13098f7e8f9dSMark Adams     Kokkos::deep_copy (d_r_k, h_r_k);
13108f7e8f9dSMark Adams     Kokkos::deep_copy (d_ic_k, h_ic_k);
13118f7e8f9dSMark Adams     // Fill A --> fact
13128f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1313042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
13148f7e8f9dSMark Adams         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
13158f7e8f9dSMark Adams         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
13168f7e8f9dSMark Adams         // zero rows of B
13178f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
13188f7e8f9dSMark Adams             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
13198f7e8f9dSMark Adams             PetscScalar *baL = ba_d + bi_d[rowb];
13208f7e8f9dSMark Adams             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
13218f7e8f9dSMark Adams             /* zero (unfactored row) */
13228f7e8f9dSMark Adams             for (int j=0;j<nzbL;j++) baL[j] = 0;
13238f7e8f9dSMark Adams             for (int j=0;j<nzbU;j++) baU[j] = 0;
13248f7e8f9dSMark Adams           });
13258f7e8f9dSMark Adams         // copy A into B
13268f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
13278f7e8f9dSMark Adams             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
13288f7e8f9dSMark Adams             const PetscScalar *av    = aa_d + ai_d[rowa];
13298f7e8f9dSMark Adams             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
13308f7e8f9dSMark Adams             /* load in initial (unfactored row) */
13318f7e8f9dSMark Adams             for (int j=0;j<nza;j++) {
13328f7e8f9dSMark Adams               PetscInt    colb = ic[ajtmp[j]];
13338f7e8f9dSMark Adams               PetscScalar vala = av[j];
13348f7e8f9dSMark Adams               if (colb == rowb) {
13358f7e8f9dSMark Adams                 *(ba_d + bdiag_d[rowb]) = vala;
13368f7e8f9dSMark Adams               } else {
13378f7e8f9dSMark Adams                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
13388f7e8f9dSMark Adams                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
13398f7e8f9dSMark Adams                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
13408f7e8f9dSMark Adams                 for (int j=0; j<nz ; j++) {
13418f7e8f9dSMark Adams                   if (pbj[j] == colb) {
13428f7e8f9dSMark Adams                     pba[j] = vala;
13438f7e8f9dSMark Adams                     set++;
13448f7e8f9dSMark Adams                     break;
13458f7e8f9dSMark Adams                   }
13468f7e8f9dSMark Adams                 }
13478f1da0b2SJunchao Zhang                #if !defined(PETSC_HAVE_SYCL)
13488f7e8f9dSMark Adams                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
13498f1da0b2SJunchao Zhang                #endif
13508f7e8f9dSMark Adams               }
13518f7e8f9dSMark Adams             }
13528f7e8f9dSMark Adams           });
13538f7e8f9dSMark Adams       });
13548f7e8f9dSMark Adams     Kokkos::fence();
1355930e68a5SMark Adams 
13568f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
13578f7e8f9dSMark Adams         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
13588f7e8f9dSMark Adams         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
13598f7e8f9dSMark Adams         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1360042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
13618f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc;
13628f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
13638f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
13648f7e8f9dSMark Adams         for (int ii=start; ii<end-1; ii++) {
13658f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
13668f7e8f9dSMark Adams           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
13678f7e8f9dSMark Adams           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
13688f7e8f9dSMark Adams           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
13698f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
13708f7e8f9dSMark Adams               PetscInt kIdx = j*Ni + field_block_idx;
13718f7e8f9dSMark Adams               if (kIdx >= nzUi) /* void */ ;
13728f7e8f9dSMark Adams               else {
13738f7e8f9dSMark Adams                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
13748f7e8f9dSMark Adams                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
13758f7e8f9dSMark Adams                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
13768f7e8f9dSMark Adams                 size_t         st_idx;
13778f7e8f9dSMark Adams                 // find and do L(k,i) = A(:k,i) / A(i,i)
13788f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
13798f7e8f9dSMark Adams                 // get column, there has got to be a better way
13808f7e8f9dSMark Adams                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
13818f7e8f9dSMark Adams                     if (pjL[j] == ii) {
13828f7e8f9dSMark Adams                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
13838f7e8f9dSMark Adams                       idx = j; // output
13848f7e8f9dSMark Adams                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
13858f7e8f9dSMark Adams                     }
13868f7e8f9dSMark Adams                 }, st_idx);
13878f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
13888f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
138999551766SMark Adams                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
139099551766SMark Adams #endif
139199551766SMark Adams                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
13928f7e8f9dSMark Adams                 // U(i+1,:end)
13938f7e8f9dSMark Adams                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
13948f7e8f9dSMark Adams                       PetscScalar Uij = baUi[uiIdx];
13958f7e8f9dSMark Adams                       PetscInt    col = bjUi[uiIdx];
13968f7e8f9dSMark Adams                       if (col==myk) {
13978f7e8f9dSMark Adams                         // A_kk = A_kk - L_ki * U_ij(k)
13988f7e8f9dSMark Adams                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
13998f7e8f9dSMark Adams                         *Akkv = *Akkv - L_ki() * Uij; // UiK
14008f7e8f9dSMark Adams                       } else {
14018f7e8f9dSMark Adams                         PetscScalar    *start, *end, *pAkjv=NULL;
14028f7e8f9dSMark Adams                         PetscInt       high, low;
14038f7e8f9dSMark Adams                         const PetscInt *startj;
14048f7e8f9dSMark Adams                         if (col<myk) { // L
14058f7e8f9dSMark Adams                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
14068f7e8f9dSMark Adams                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
14078f7e8f9dSMark Adams                           start = pLki+1; // start at pLki+1, A22(myk,1)
14088f7e8f9dSMark Adams                           startj= bj_d + bi_d[myk] + idx;
14098f7e8f9dSMark Adams                           end   = ba_d + bi_d[myk+1];
14108f7e8f9dSMark Adams                         } else {
14118f7e8f9dSMark Adams                           PetscInt idx = bdiag_d[myk+1]+1;
14128f7e8f9dSMark Adams                           start = ba_d + idx;
14138f7e8f9dSMark Adams                           startj= bj_d + idx;
14148f7e8f9dSMark Adams                           end   = ba_d + bdiag_d[myk];
14158f7e8f9dSMark Adams                         }
14168f7e8f9dSMark Adams                         // search for 'col', use bisection search - TODO
14178f7e8f9dSMark Adams                         low  = 0;
14188f7e8f9dSMark Adams                         high = (PetscInt)(end-start);
14198f7e8f9dSMark Adams                         while (high-low > 5) {
14208f7e8f9dSMark Adams                           int t = (low+high)/2;
14218f7e8f9dSMark Adams                           if (startj[t] > col) high = t;
14228f7e8f9dSMark Adams                           else                 low  = t;
14238f7e8f9dSMark Adams                         }
14248f7e8f9dSMark Adams                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
14258f7e8f9dSMark Adams                           if (startj[pAkjv-start] == col) break;
14268f7e8f9dSMark Adams                         }
14278f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
142899551766SMark Adams                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
142999551766SMark Adams #endif
14308f7e8f9dSMark Adams                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
14318f7e8f9dSMark Adams                       }
14328f7e8f9dSMark Adams                     });
14338f7e8f9dSMark Adams               }
14348f7e8f9dSMark Adams             });
14358f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
14368f7e8f9dSMark Adams           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
14378f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
14388f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
14398f7e8f9dSMark Adams       });
14408f7e8f9dSMark Adams     Kokkos::fence();
14418f7e8f9dSMark Adams     Kokkos::deep_copy (h_flops_k, d_flops_k);
14428f7e8f9dSMark Adams     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
14438f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
14448f7e8f9dSMark Adams         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
14458f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
14468f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
14478f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
14488f7e8f9dSMark Adams             int i = start + outer_index*Ni + lg_rank%Ni;
14498f7e8f9dSMark Adams             if (i < end) {
14508f7e8f9dSMark Adams               PetscScalar *pv = ba_d + bdiag_d[i];
14518f7e8f9dSMark Adams               *pv = 1.0/(*pv);
14528f7e8f9dSMark Adams             }
14538f7e8f9dSMark Adams           });
14548f7e8f9dSMark Adams       });
14558f7e8f9dSMark Adams   }
14568f7e8f9dSMark Adams   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
14578f7e8f9dSMark Adams   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
14588f7e8f9dSMark Adams   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
14598f7e8f9dSMark Adams 
14608f7e8f9dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
14618f7e8f9dSMark Adams   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
14628f7e8f9dSMark Adams   if (b->inode.size) {
14638f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
14648f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
14658f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
14668f7e8f9dSMark Adams   } else {
14678f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
14688f7e8f9dSMark Adams   }
14698f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
14708f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
14718f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
14728f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
14738f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
14748f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
14758f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
14768f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
14778f7e8f9dSMark Adams 
1478930e68a5SMark Adams   PetscFunctionReturn(0);
1479930e68a5SMark Adams }
1480930e68a5SMark Adams 
148186a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1482930e68a5SMark Adams {
1483930e68a5SMark Adams   PetscErrorCode   ierr;
1484930e68a5SMark Adams 
1485930e68a5SMark Adams   PetscFunctionBegin;
1486930e68a5SMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
148786a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
148886a27549SJunchao Zhang   PetscFunctionReturn(0);
148986a27549SJunchao Zhang }
149086a27549SJunchao Zhang 
149186a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
149286a27549SJunchao Zhang {
149386a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
149486a27549SJunchao Zhang 
149586a27549SJunchao Zhang   PetscFunctionBegin;
149686a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
149786a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
149886a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
149986a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
150086a27549SJunchao Zhang   }
150186a27549SJunchao Zhang   PetscFunctionReturn(0);
150286a27549SJunchao Zhang }
150386a27549SJunchao Zhang 
150486a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
150586a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
150686a27549SJunchao Zhang {
150786a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1508076ba34aSJunchao Zhang   MatColIdxType             n = A->rmap->n;
150986a27549SJunchao Zhang 
151086a27549SJunchao Zhang   PetscFunctionBegin;
151186a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
151286a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
1513076ba34aSJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
151486a27549SJunchao Zhang     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1515076ba34aSJunchao Zhang     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1516076ba34aSJunchao Zhang     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
151786a27549SJunchao Zhang 
151886a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1519076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1520076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1521076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
152286a27549SJunchao Zhang         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
152386a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
152486a27549SJunchao Zhang 
152586a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
152686a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
152786a27549SJunchao Zhang     */
1528076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1529076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
153086a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
153186a27549SJunchao Zhang 
153286a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
153386a27549SJunchao Zhang 
153486a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
1535076ba34aSJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
153686a27549SJunchao Zhang     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1537076ba34aSJunchao Zhang     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1538076ba34aSJunchao Zhang     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
153986a27549SJunchao Zhang 
154086a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1541076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1542076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1543076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
154486a27549SJunchao Zhang         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
154586a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
154686a27549SJunchao Zhang 
154786a27549SJunchao Zhang     /* Sort indices. See comments above */
1548076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1549076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
155086a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
155186a27549SJunchao Zhang 
155286a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
155386a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
155486a27549SJunchao Zhang   }
155586a27549SJunchao Zhang   PetscFunctionReturn(0);
155686a27549SJunchao Zhang }
155786a27549SJunchao Zhang 
155886a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
155986a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
156086a27549SJunchao Zhang {
156186a27549SJunchao Zhang   PetscErrorCode                 ierr;
156286a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
156386a27549SJunchao Zhang   PetscScalarKokkosView          xv;
156486a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
156586a27549SJunchao Zhang 
156686a27549SJunchao Zhang   PetscFunctionBegin;
1567eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
156886a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
156986a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1570ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr);
157186a27549SJunchao Zhang   /* Solve L tmpv = b */
15723f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
157386a27549SJunchao Zhang   /* Solve Ux = tmpv */
15743f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
157586a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1576ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr);
1577eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
157886a27549SJunchao Zhang   PetscFunctionReturn(0);
157986a27549SJunchao Zhang }
158086a27549SJunchao Zhang 
1581076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
158286a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
158386a27549SJunchao Zhang {
158486a27549SJunchao Zhang   PetscErrorCode                 ierr;
158586a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
158686a27549SJunchao Zhang   PetscScalarKokkosView          xv;
158786a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
158886a27549SJunchao Zhang 
158986a27549SJunchao Zhang   PetscFunctionBegin;
1590eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
159186a27549SJunchao Zhang   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
159286a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1593ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr);
159486a27549SJunchao Zhang   /* Solve U^T tmpv = b */
159586a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
159686a27549SJunchao Zhang 
159786a27549SJunchao Zhang   /* Solve L^T x = tmpv */
159886a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
159986a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1600ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr);
1601eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
160286a27549SJunchao Zhang   PetscFunctionReturn(0);
160386a27549SJunchao Zhang }
160486a27549SJunchao Zhang 
160586a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
160686a27549SJunchao Zhang {
160786a27549SJunchao Zhang   PetscErrorCode                 ierr;
160886a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
160986a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
161086a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
161186a27549SJunchao Zhang 
161286a27549SJunchao Zhang   PetscFunctionBegin;
1613eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
161486a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1615076ba34aSJunchao Zhang 
1616076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1617076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1618076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1619076ba34aSJunchao Zhang 
1620076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
162186a27549SJunchao Zhang 
162286a27549SJunchao Zhang   B->assembled                       = PETSC_TRUE;
162386a27549SJunchao Zhang   B->preallocated                    = PETSC_TRUE;
162486a27549SJunchao Zhang   B->ops->solve                      = MatSolve_SeqAIJKokkos;
162586a27549SJunchao Zhang   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
162686a27549SJunchao Zhang   B->ops->matsolve                   = NULL;
162786a27549SJunchao Zhang   B->ops->matsolvetranspose          = NULL;
162886a27549SJunchao Zhang   B->offloadmask                     = PETSC_OFFLOAD_GPU;
162986a27549SJunchao Zhang 
163086a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
163186a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
163286a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1633eeadb341SJunchao Zhang   /* TODO: log flops, but how to know that? */
1634eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
163586a27549SJunchao Zhang   PetscFunctionReturn(0);
163686a27549SJunchao Zhang }
163786a27549SJunchao Zhang 
163886a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
163986a27549SJunchao Zhang {
164086a27549SJunchao Zhang   PetscErrorCode                 ierr;
164186a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok;
164286a27549SJunchao Zhang   Mat_SeqAIJ                     *b;
164386a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
164486a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
164586a27549SJunchao Zhang   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
164686a27549SJunchao Zhang   PetscInt                       n = A->rmap->n;
164786a27549SJunchao Zhang 
164886a27549SJunchao Zhang   PetscFunctionBegin;
164986a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
165086a27549SJunchao Zhang   /* Rebuild factors */
165186a27549SJunchao Zhang   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
165286a27549SJunchao Zhang   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
165386a27549SJunchao Zhang 
165486a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
165586a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
165686a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
165786a27549SJunchao Zhang 
165886a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
165986a27549SJunchao Zhang 
166086a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
166186a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
166286a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d,n+1);
166386a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
166486a27549SJunchao Zhang 
166586a27549SJunchao Zhang   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1666076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1667076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1668076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
166986a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
167086a27549SJunchao Zhang 
167186a27549SJunchao Zhang   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
167286a27549SJunchao Zhang   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
167386a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
167486a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
167586a27549SJunchao Zhang 
167686a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
167786a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
167886a27549SJunchao Zhang  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
167986a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
168086a27549SJunchao Zhang  #else
168186a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
168286a27549SJunchao Zhang  #endif
168386a27549SJunchao Zhang 
168486a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
168586a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
168686a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
168786a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
168886a27549SJunchao Zhang 
168986a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
169086a27549SJunchao Zhang   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
169186a27549SJunchao Zhang   b     = (Mat_SeqAIJ*)B->data;
169286a27549SJunchao Zhang   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
169386a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
169486a27549SJunchao Zhang   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
169586a27549SJunchao Zhang 
169686a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
169786a27549SJunchao Zhang   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1698930e68a5SMark Adams   PetscFunctionReturn(0);
1699930e68a5SMark Adams }
1700930e68a5SMark Adams 
17018f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
17028f7e8f9dSMark Adams {
17038f7e8f9dSMark Adams   PetscErrorCode   ierr;
17048f7e8f9dSMark Adams   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
17058f7e8f9dSMark Adams   const PetscInt   nrows   = A->rmap->n;
1706930e68a5SMark Adams 
17078f7e8f9dSMark Adams   PetscFunctionBegin;
17088f7e8f9dSMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
17098f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
17108f7e8f9dSMark Adams   // move B data into Kokkos
17118f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
17128f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
17138f7e8f9dSMark Adams   {
17148f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1715300d22a6SJunchao Zhang     if (!baijkok->diag_d.extent(0)) {
17168f7e8f9dSMark Adams       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1717300d22a6SJunchao Zhang       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1718300d22a6SJunchao Zhang       Kokkos::deep_copy (baijkok->diag_d, h_diag);
17198f7e8f9dSMark Adams     }
17208f7e8f9dSMark Adams   }
17218f7e8f9dSMark Adams   PetscFunctionReturn(0);
17228f7e8f9dSMark Adams }
17238f7e8f9dSMark Adams 
172486a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1725930e68a5SMark Adams {
1726930e68a5SMark Adams   PetscFunctionBegin;
1727930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
1728930e68a5SMark Adams   PetscFunctionReturn(0);
1729930e68a5SMark Adams }
1730930e68a5SMark Adams 
17318f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
17328f7e8f9dSMark Adams {
17338f7e8f9dSMark Adams   PetscFunctionBegin;
17348f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
17358f7e8f9dSMark Adams   PetscFunctionReturn(0);
17368f7e8f9dSMark Adams }
17378f7e8f9dSMark Adams 
1738930e68a5SMark Adams /*MC
173986a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
174086a27549SJunchao Zhang   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1741930e68a5SMark Adams 
1742930e68a5SMark Adams   Level: beginner
1743930e68a5SMark Adams 
1744*3f3ba80aSJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1745930e68a5SMark Adams M*/
174686a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1747930e68a5SMark Adams {
1748930e68a5SMark Adams   PetscErrorCode ierr;
1749930e68a5SMark Adams   PetscInt       n = A->rmap->n;
1750930e68a5SMark Adams 
1751930e68a5SMark Adams   PetscFunctionBegin;
1752930e68a5SMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1753930e68a5SMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1754930e68a5SMark Adams   (*B)->factortype = ftype;
17554ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1756930e68a5SMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1757930e68a5SMark Adams 
17588f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
1759930e68a5SMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
176086a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_TRUE;
176186a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
176286a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
176386a27549SJunchao Zhang     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
176486a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
176586a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
176698921bdaSJacob Faibussowitsch   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1767930e68a5SMark Adams 
1768930e68a5SMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
176986a27549SJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1770930e68a5SMark Adams   PetscFunctionReturn(0);
1771930e68a5SMark Adams }
17728f7e8f9dSMark Adams 
17738f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
17748f7e8f9dSMark Adams {
17758f7e8f9dSMark Adams   PetscErrorCode ierr;
17768f7e8f9dSMark Adams   PetscInt       n = A->rmap->n;
17778f7e8f9dSMark Adams 
17788f7e8f9dSMark Adams   PetscFunctionBegin;
17798f7e8f9dSMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
17808f7e8f9dSMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
17818f7e8f9dSMark Adams   (*B)->factortype = ftype;
1782f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
17834ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
17848f7e8f9dSMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
17858f7e8f9dSMark Adams 
17868f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
17878f7e8f9dSMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
17888f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
17898f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
17908f7e8f9dSMark Adams 
17918f7e8f9dSMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
17928f7e8f9dSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
17938f7e8f9dSMark Adams   PetscFunctionReturn(0);
17948f7e8f9dSMark Adams }
179586a27549SJunchao Zhang 
179686a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
179786a27549SJunchao Zhang {
179886a27549SJunchao Zhang   PetscErrorCode ierr;
179986a27549SJunchao Zhang 
180086a27549SJunchao Zhang   PetscFunctionBegin;
180186a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
180286a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
180386a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
180486a27549SJunchao Zhang   PetscFunctionReturn(0);
180586a27549SJunchao Zhang }
180686a27549SJunchao Zhang 
1807076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
1808076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1809076ba34aSJunchao Zhang {
1810076ba34aSJunchao Zhang   PetscErrorCode    ierr;
1811076ba34aSJunchao Zhang   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1812076ba34aSJunchao Zhang   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1813076ba34aSJunchao Zhang   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1814076ba34aSJunchao Zhang   const PetscInt    *i = iv.data();
1815076ba34aSJunchao Zhang   const PetscInt    *j = jv.data();
1816076ba34aSJunchao Zhang   const PetscScalar *a = av.data();
1817076ba34aSJunchao Zhang   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1818076ba34aSJunchao Zhang 
1819076ba34aSJunchao Zhang   PetscFunctionBegin;
1820c0aa6a63SJacob Faibussowitsch   ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1821076ba34aSJunchao Zhang   for (PetscInt k=0; k<m; k++) {
1822c0aa6a63SJacob Faibussowitsch     ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr);
1823076ba34aSJunchao Zhang     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1824c0aa6a63SJacob Faibussowitsch       ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1825076ba34aSJunchao Zhang     }
1826076ba34aSJunchao Zhang     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1827076ba34aSJunchao Zhang   }
1828076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1829076ba34aSJunchao Zhang }
1830