xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision ad7ac7b24a5c292386611bcef5091e1f3cc58d34)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2076ba34aSJunchao Zhang #include <petscpkg_version.h>
3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
442550becSJunchao Zhang #include <petsc/private/sfimpl.h>
58c3ff71bSJunchao Zhang #include <petscsystypes.h>
68c3ff71bSJunchao Zhang #include <petscerror.h>
78c3ff71bSJunchao Zhang 
88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
9f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
10076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp>
118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
128c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
15076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
16076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
1786a27549SJunchao Zhang 
1842550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
198c3ff71bSJunchao Zhang 
208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
218c3ff71bSJunchao Zhang 
22076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
25076ba34aSJunchao Zhang  */
268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
278c3ff71bSJunchao Zhang {
288c3ff71bSJunchao Zhang   PetscErrorCode    ierr;
29076ba34aSJunchao Zhang   Mat_SeqAIJ        *aijseq;
30076ba34aSJunchao Zhang   Mat_SeqAIJKokkos  *aijkok;
318c3ff71bSJunchao Zhang 
328c3ff71bSJunchao Zhang   PetscFunctionBegin;
33076ba34aSJunchao Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
348c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
35076ba34aSJunchao Zhang 
36076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38076ba34aSJunchao Zhang 
39076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
40076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
41076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
42076ba34aSJunchao Zhang   */
43076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
44076ba34aSJunchao Zhang     delete aijkok;
45076ba34aSJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
46076ba34aSJunchao Zhang     A->spptr = aijkok;
47076ba34aSJunchao Zhang   }
48076ba34aSJunchao Zhang 
49a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
50a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
51a587d139SMark   }
528c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
538c3ff71bSJunchao Zhang }
548c3ff71bSJunchao Zhang 
5586a27549SJunchao Zhang /* Sync CSR data to device if not yet */
56076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
578c3ff71bSJunchao Zhang {
588c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
598c3ff71bSJunchao Zhang 
608c3ff71bSJunchao Zhang   PetscFunctionBegin;
612c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
622c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
632c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
64076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
65076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
66580c7c76SPierre Jolivet     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
6786a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
688c3ff71bSJunchao Zhang   }
698c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
708c3ff71bSJunchao Zhang }
718c3ff71bSJunchao Zhang 
72076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
73fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
7486a27549SJunchao Zhang {
7586a27549SJunchao Zhang   PetscErrorCode   ierr;
7686a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
7786a27549SJunchao Zhang 
7886a27549SJunchao Zhang   PetscFunctionBegin;
792c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
8086a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
8186a27549SJunchao Zhang   aijkok->a_dual.modify_device();
8286a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
8386a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
842328674fSJunchao Zhang   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
8586a27549SJunchao Zhang   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
8686a27549SJunchao Zhang   PetscFunctionReturn(0);
8786a27549SJunchao Zhang }
8886a27549SJunchao Zhang 
89f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
90f0cf5187SStefano Zampini {
91f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
92f0cf5187SStefano Zampini 
93f0cf5187SStefano Zampini   PetscFunctionBegin;
94f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9586a27549SJunchao Zhang    /* We do not expect one needs factors on host  */
962c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
972c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
98076ba34aSJunchao Zhang   aijkok->a_dual.sync_host();
99f0cf5187SStefano Zampini   PetscFunctionReturn(0);
100f0cf5187SStefano Zampini }
101f0cf5187SStefano Zampini 
102f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
103f0cf5187SStefano Zampini {
104076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
105f0cf5187SStefano Zampini 
106f0cf5187SStefano Zampini   PetscFunctionBegin;
107076ba34aSJunchao Zhang   if (aijkok) {
108076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
109076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
110076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
111076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
112076ba34aSJunchao Zhang   }
113076ba34aSJunchao Zhang   PetscFunctionReturn(0);
114076ba34aSJunchao Zhang }
115076ba34aSJunchao Zhang 
116076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
117076ba34aSJunchao Zhang {
118076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
119076ba34aSJunchao Zhang 
120076ba34aSJunchao Zhang   PetscFunctionBegin;
121076ba34aSJunchao Zhang   if (aijkok) aijkok->a_dual.modify_host();
122076ba34aSJunchao Zhang   PetscFunctionReturn(0);
123076ba34aSJunchao Zhang }
124076ba34aSJunchao Zhang 
125076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
126076ba34aSJunchao Zhang {
127076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
128076ba34aSJunchao Zhang 
129076ba34aSJunchao Zhang   PetscFunctionBegin;
1302328674fSJunchao Zhang   if (aijkok) {
131076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
132076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1332328674fSJunchao Zhang   } else {
1342328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1352328674fSJunchao Zhang   }
136076ba34aSJunchao Zhang   PetscFunctionReturn(0);
137076ba34aSJunchao Zhang }
138076ba34aSJunchao Zhang 
139076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
140076ba34aSJunchao Zhang {
141076ba34aSJunchao Zhang   PetscFunctionBegin;
142076ba34aSJunchao Zhang   *array = NULL;
143076ba34aSJunchao Zhang   PetscFunctionReturn(0);
144076ba34aSJunchao Zhang }
145076ba34aSJunchao Zhang 
146076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
147076ba34aSJunchao Zhang {
148076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
149076ba34aSJunchao Zhang 
150076ba34aSJunchao Zhang   PetscFunctionBegin;
1512328674fSJunchao Zhang   if (aijkok) {
152076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1532328674fSJunchao Zhang   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
1542328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1552328674fSJunchao Zhang   }
156076ba34aSJunchao Zhang   PetscFunctionReturn(0);
157076ba34aSJunchao Zhang }
158076ba34aSJunchao Zhang 
159076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
160076ba34aSJunchao Zhang {
161076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
162076ba34aSJunchao Zhang 
163076ba34aSJunchao Zhang   PetscFunctionBegin;
1642328674fSJunchao Zhang   if (aijkok) {
165076ba34aSJunchao Zhang     aijkok->a_dual.clear_sync_state();
166076ba34aSJunchao Zhang     aijkok->a_dual.modify_host();
1672328674fSJunchao Zhang   }
168f0cf5187SStefano Zampini   PetscFunctionReturn(0);
169f0cf5187SStefano Zampini }
170f0cf5187SStefano Zampini 
171a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
172042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
173a587d139SMark {
174042217e8SBarry Smith   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
175042217e8SBarry Smith   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
176a587d139SMark 
177a587d139SMark   PetscFunctionBegin;
1782c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
179152b3e56SJunchao Zhang   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
180a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
181a587d139SMark   PetscFunctionReturn(0);
182a587d139SMark }
183a587d139SMark 
184a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
185042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
186a587d139SMark {
187042217e8SBarry Smith   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
188a587d139SMark 
189a587d139SMark   PetscFunctionBegin;
190a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
191a587d139SMark     *d_mat = aijkok->device_mat_d.data();
192a587d139SMark   } else {
193a587d139SMark     PetscErrorCode   ierr;
194a587d139SMark     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
195a587d139SMark     *d_mat  = NULL;
196a587d139SMark   }
197a587d139SMark   PetscFunctionReturn(0);
198a587d139SMark }
199076ba34aSJunchao Zhang 
200076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */
201076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
202152b3e56SJunchao Zhang {
203152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
204152b3e56SJunchao Zhang 
205152b3e56SJunchao Zhang   PetscFunctionBegin;
2062c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207076ba34aSJunchao Zhang   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
208076ba34aSJunchao Zhang     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
209076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
210076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
211076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
21286a27549SJunchao Zhang     aijkok->transpose_updated = PETSC_TRUE;
213076ba34aSJunchao Zhang   }
214076ba34aSJunchao Zhang   *csrmatT = &aijkok->csrmatT;
215152b3e56SJunchao Zhang   PetscFunctionReturn(0);
216152b3e56SJunchao Zhang }
217152b3e56SJunchao Zhang 
218076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */
219076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
220152b3e56SJunchao Zhang {
221eeadb341SJunchao Zhang   PetscErrorCode                   ierr;
222152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
223152b3e56SJunchao Zhang 
224152b3e56SJunchao Zhang   PetscFunctionBegin;
225eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2262c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
227076ba34aSJunchao Zhang   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
228076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
229076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
230076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
231076ba34aSJunchao Zhang    #if defined(PETSC_USE_COMPLEX)
232076ba34aSJunchao Zhang     const auto& a = aijkok->csrmatH.values;
233076ba34aSJunchao Zhang     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
234076ba34aSJunchao Zhang    #endif
23586a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_TRUE;
236076ba34aSJunchao Zhang   }
237076ba34aSJunchao Zhang   *csrmatH = &aijkok->csrmatH;
238eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
239152b3e56SJunchao Zhang   PetscFunctionReturn(0);
240152b3e56SJunchao Zhang }
241a587d139SMark 
2428c3ff71bSJunchao Zhang /* y = A x */
2438c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2448c3ff71bSJunchao Zhang {
2458c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2468c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
247152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
248152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2498c3ff71bSJunchao Zhang 
2508c3ff71bSJunchao Zhang   PetscFunctionBegin;
2518c3ff71bSJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
252152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
253*ad7ac7b2SJunchao Zhang   ierr   = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
2548c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
255eeadb341SJunchao Zhang   ierr   = PetscLogGpuTimeBegin();CHKERRQ(ierr);
256152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
257152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
258*ad7ac7b2SJunchao Zhang   ierr   = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
259076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
260152b3e56SJunchao Zhang   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2618c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2628c3ff71bSJunchao Zhang }
2638c3ff71bSJunchao Zhang 
2648c3ff71bSJunchao Zhang /* y = A^T x */
2658c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2668c3ff71bSJunchao Zhang {
2678c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2688c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
269152b3e56SJunchao Zhang   const char                       *mode;
270152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
271152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
272076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
2738c3ff71bSJunchao Zhang 
2748c3ff71bSJunchao Zhang   PetscFunctionBegin;
275eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2768c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
277152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
278*ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
279152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
280076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
281152b3e56SJunchao Zhang     mode = "N";
282152b3e56SJunchao Zhang   } else {
283076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
284076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
285152b3e56SJunchao Zhang     mode = "T";
286152b3e56SJunchao Zhang   }
287076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
288152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
289*ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
290076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
291eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2928c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2938c3ff71bSJunchao Zhang }
2948c3ff71bSJunchao Zhang 
2958c3ff71bSJunchao Zhang /* y = A^H x */
2968c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2978c3ff71bSJunchao Zhang {
2988c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2998c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
300152b3e56SJunchao Zhang   const char                       *mode;
301152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
302152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
303076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3048c3ff71bSJunchao Zhang 
3058c3ff71bSJunchao Zhang   PetscFunctionBegin;
306eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3078c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
308152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
309*ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
310152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
311076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
312152b3e56SJunchao Zhang     mode = "N";
313152b3e56SJunchao Zhang   } else {
314076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
315076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
316152b3e56SJunchao Zhang     mode = "C";
317152b3e56SJunchao Zhang   }
318076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
319152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
320*ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(yy,&yv);CHKERRQ(ierr);
321076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
322eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3238c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3248c3ff71bSJunchao Zhang }
3258c3ff71bSJunchao Zhang 
3268c3ff71bSJunchao Zhang /* z = A x + y */
3278c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
3288c3ff71bSJunchao Zhang {
3298c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3308c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
331152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
332152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
3338c3ff71bSJunchao Zhang 
3348c3ff71bSJunchao Zhang   PetscFunctionBegin;
335eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3368c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
337152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
338152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
339*ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
3408c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
3418c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
342152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
343152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
344152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
345*ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
346152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
347eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3488c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3498c3ff71bSJunchao Zhang }
3508c3ff71bSJunchao Zhang 
3518c3ff71bSJunchao Zhang /* z = A^T x + y */
3528c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3538c3ff71bSJunchao Zhang {
3548c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3558c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
356152b3e56SJunchao Zhang   const char                       *mode;
357152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
358152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
359076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3608c3ff71bSJunchao Zhang 
3618c3ff71bSJunchao Zhang   PetscFunctionBegin;
362eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3638c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
364152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
365152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
366*ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
3678c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
368152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
369076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
370152b3e56SJunchao Zhang     mode = "N";
371152b3e56SJunchao Zhang   } else {
372076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
373076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
374152b3e56SJunchao Zhang     mode = "T";
375152b3e56SJunchao Zhang   }
376076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
377152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
378152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
379*ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
380076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
381eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3828c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3838c3ff71bSJunchao Zhang }
3848c3ff71bSJunchao Zhang 
3858c3ff71bSJunchao Zhang /* z = A^H x + y */
3868c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3878c3ff71bSJunchao Zhang {
3888c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3898c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
390152b3e56SJunchao Zhang   const char                       *mode;
391152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
392152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
393076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3948c3ff71bSJunchao Zhang 
3958c3ff71bSJunchao Zhang   PetscFunctionBegin;
396eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3978c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
398152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
399152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
400*ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
4018c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
402152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
403076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
404152b3e56SJunchao Zhang     mode = "N";
405152b3e56SJunchao Zhang   } else {
406076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
407076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
408152b3e56SJunchao Zhang     mode = "C";
409152b3e56SJunchao Zhang   }
410076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
411152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
412152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
413*ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(zz,&zv);CHKERRQ(ierr);
414076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
415eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
416152b3e56SJunchao Zhang   PetscFunctionReturn(0);
417152b3e56SJunchao Zhang }
418152b3e56SJunchao Zhang 
419152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
420152b3e56SJunchao Zhang {
421152b3e56SJunchao Zhang   PetscErrorCode            ierr;
422152b3e56SJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
423152b3e56SJunchao Zhang 
424152b3e56SJunchao Zhang   PetscFunctionBegin;
425152b3e56SJunchao Zhang   switch (op) {
426152b3e56SJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
427152b3e56SJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
428152b3e56SJunchao Zhang       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
429152b3e56SJunchao Zhang       A->form_explicit_transpose = flg;
430152b3e56SJunchao Zhang       break;
431152b3e56SJunchao Zhang     default:
432152b3e56SJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
433152b3e56SJunchao Zhang       break;
434152b3e56SJunchao Zhang   }
4358c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4368c3ff71bSJunchao Zhang }
4378c3ff71bSJunchao Zhang 
438076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
4393d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
4408c3ff71bSJunchao Zhang {
4418c3ff71bSJunchao Zhang   PetscErrorCode   ierr;
442076ba34aSJunchao Zhang   Mat_SeqAIJ       *aseq;
4438c3ff71bSJunchao Zhang 
4448c3ff71bSJunchao Zhang   PetscFunctionBegin;
445a3f881fbSStefano Zampini   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
446076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
447076ba34aSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
4488c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
449076ba34aSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
450076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
4512c71b3e2SJacob Faibussowitsch     PetscCheckFalse(A != *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
452076ba34aSJunchao Zhang     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
453076ba34aSJunchao Zhang     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
454076ba34aSJunchao Zhang     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
455076ba34aSJunchao Zhang     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
456076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ*>(A->data);
457394ed5ebSJunchao Zhang     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
4582c71b3e2SJacob Faibussowitsch       PetscCheckFalse(A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
459076ba34aSJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
4608c3ff71bSJunchao Zhang     }
461076ba34aSJunchao Zhang   }
4628c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4638c3ff71bSJunchao Zhang }
4648c3ff71bSJunchao Zhang 
465076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
466076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
467076ba34aSJunchao Zhang  */
468076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
4698c3ff71bSJunchao Zhang {
4708c3ff71bSJunchao Zhang   PetscErrorCode        ierr;
471076ba34aSJunchao Zhang   Mat_SeqAIJ            *bseq;
472076ba34aSJunchao Zhang   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
473076ba34aSJunchao Zhang   Mat                   mat;
4748c3ff71bSJunchao Zhang 
4758c3ff71bSJunchao Zhang   PetscFunctionBegin;
476076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
477076ba34aSJunchao Zhang   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
478076ba34aSJunchao Zhang   mat  = *B;
479076ba34aSJunchao Zhang   if (A->assembled) {
480076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
481076ba34aSJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
482076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
483076ba34aSJunchao Zhang     /* Now copy values to B if needed */
484076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
485076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
486076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
487076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
488076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
489076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
490076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
491076ba34aSJunchao Zhang       }
492076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
493076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
494076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
495076ba34aSJunchao Zhang     }
496076ba34aSJunchao Zhang     mat->spptr = bkok;
497076ba34aSJunchao Zhang   }
498076ba34aSJunchao Zhang 
499076ba34aSJunchao Zhang   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
500076ba34aSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
501076ba34aSJunchao Zhang   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
502076ba34aSJunchao Zhang   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
5038c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5048c3ff71bSJunchao Zhang }
5058c3ff71bSJunchao Zhang 
5060ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
5070ecb592aSJunchao Zhang {
5080ecb592aSJunchao Zhang   PetscErrorCode    ierr;
5090ecb592aSJunchao Zhang   Mat               At;
510ff751488SJunchao Zhang   KokkosCsrMatrix   *internT;
5110ecb592aSJunchao Zhang   Mat_SeqAIJKokkos  *atkok,*bkok;
5120ecb592aSJunchao Zhang 
5130ecb592aSJunchao Zhang   PetscFunctionBegin;
5140ecb592aSJunchao Zhang   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
5150ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
516ff751488SJunchao Zhang     /* Deep copy internT, as we want to isolate the internal transpose */
517ff751488SJunchao Zhang     CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
5180ecb592aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
5190ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
5207a3b1c56SStefano Zampini     else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
5210ecb592aSJunchao Zhang   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
5220ecb592aSJunchao Zhang     if ((*B)->assembled) {
5230ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
5240ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
5250ecb592aSJunchao Zhang       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
5260ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
5270ecb592aSJunchao Zhang       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
5280ecb592aSJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
5290ecb592aSJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
5300ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
5310ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
5320ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
5330ecb592aSJunchao Zhang   }
5340ecb592aSJunchao Zhang   PetscFunctionReturn(0);
5350ecb592aSJunchao Zhang }
5360ecb592aSJunchao Zhang 
5378c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
5388c3ff71bSJunchao Zhang {
5398c3ff71bSJunchao Zhang   PetscErrorCode             ierr;
54086a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
5418c3ff71bSJunchao Zhang 
5428c3ff71bSJunchao Zhang   PetscFunctionBegin;
54386a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
54486a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
5458c3ff71bSJunchao Zhang     delete aijkok;
54686a27549SJunchao Zhang   } else {
54786a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
54886a27549SJunchao Zhang   }
549152b3e56SJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
55042550becSJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
55142550becSJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       NULL);CHKERRQ(ierr);
552152b3e56SJunchao Zhang   A->spptr = NULL;
5538c3ff71bSJunchao Zhang   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
5548c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5558c3ff71bSJunchao Zhang }
5568c3ff71bSJunchao Zhang 
55786a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
55886a27549SJunchao Zhang {
55986a27549SJunchao Zhang   PetscErrorCode ierr;
56086a27549SJunchao Zhang 
56186a27549SJunchao Zhang   PetscFunctionBegin;
56286a27549SJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
56386a27549SJunchao Zhang   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
56486a27549SJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
56586a27549SJunchao Zhang   PetscFunctionReturn(0);
56686a27549SJunchao Zhang }
56786a27549SJunchao Zhang 
568076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
569076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
570a3f881fbSStefano Zampini {
571076ba34aSJunchao Zhang   PetscErrorCode               ierr;
572076ba34aSJunchao Zhang   Mat_SeqAIJ                   *a,*b;
573076ba34aSJunchao Zhang   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
574076ba34aSJunchao Zhang   MatScalarKokkosView          aa,ba,ca;
575076ba34aSJunchao Zhang   MatRowMapKokkosView          ai,bi,ci;
576076ba34aSJunchao Zhang   MatColIdxKokkosView          aj,bj,cj;
577076ba34aSJunchao Zhang   PetscInt                     m,n,nnz,aN;
578a3f881fbSStefano Zampini 
579a3f881fbSStefano Zampini   PetscFunctionBegin;
580076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
581076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
582076ba34aSJunchao Zhang   PetscValidPointer(C,4);
583076ba34aSJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
584076ba34aSJunchao Zhang   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
5852c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
5862c71b3e2SJacob Faibussowitsch   PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
587076ba34aSJunchao Zhang 
588076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
589076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
590076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ*>(A->data);
591076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ*>(B->data);
592076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
593076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
594076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
595076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
596076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
597076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
598076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
599076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
600076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
601076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
602076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
603076ba34aSJunchao Zhang     aj = akok->j_dual.view_device();
604076ba34aSJunchao Zhang     bj = bkok->j_dual.view_device();
605076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
606076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
607076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
608076ba34aSJunchao Zhang     ca = ca_dual.view_device();
609076ba34aSJunchao Zhang     ci = ci_dual.view_device();
610076ba34aSJunchao Zhang     cj = cj_dual.view_device();
611076ba34aSJunchao Zhang 
612076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
613076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
614076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
615076ba34aSJunchao Zhang       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
616076ba34aSJunchao Zhang 
617076ba34aSJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
618076ba34aSJunchao Zhang         ci(i) = coffset;
619076ba34aSJunchao Zhang         if (i == m-1) ci(m) = ai(m) + bi(m);
620076ba34aSJunchao Zhang       });
621076ba34aSJunchao Zhang 
622076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
623076ba34aSJunchao Zhang         if (k < alen) {
624076ba34aSJunchao Zhang           ca(coffset+k) = aa(ai(i)+k);
625076ba34aSJunchao Zhang           cj(coffset+k) = aj(ai(i)+k);
626076ba34aSJunchao Zhang         } else {
627076ba34aSJunchao Zhang           ca(coffset+k) = ba(bi(i)+k-alen);
628076ba34aSJunchao Zhang           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
629076ba34aSJunchao Zhang         }
630076ba34aSJunchao Zhang       });
631076ba34aSJunchao Zhang     });
632076ba34aSJunchao Zhang     ca_dual.modify_device();
633076ba34aSJunchao Zhang     ci_dual.modify_device();
634076ba34aSJunchao Zhang     cj_dual.modify_device();
635076ba34aSJunchao Zhang     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
636076ba34aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
637076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
638076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
639076ba34aSJunchao Zhang     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
640076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
641076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
642076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
643076ba34aSJunchao Zhang 
644076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
645076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
646076ba34aSJunchao Zhang       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
647076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
648076ba34aSJunchao Zhang         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
649076ba34aSJunchao Zhang         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
650076ba34aSJunchao Zhang       });
651076ba34aSJunchao Zhang     });
652076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
653076ba34aSJunchao Zhang   }
654076ba34aSJunchao Zhang   PetscFunctionReturn(0);
655076ba34aSJunchao Zhang }
656076ba34aSJunchao Zhang 
657076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
658076ba34aSJunchao Zhang {
659076ba34aSJunchao Zhang   PetscFunctionBegin;
660076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
661a3f881fbSStefano Zampini   PetscFunctionReturn(0);
662a3f881fbSStefano Zampini }
663a3f881fbSStefano Zampini 
664a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
665a3f881fbSStefano Zampini {
666076ba34aSJunchao Zhang   PetscErrorCode                 ierr;
667a3f881fbSStefano Zampini   Mat_Product                    *product = C->product;
668a3f881fbSStefano Zampini   Mat                            A,B;
669076ba34aSJunchao Zhang   bool                           transA,transB; /* use bool, since KK needs this type */
670a3f881fbSStefano Zampini   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
671a3f881fbSStefano Zampini   Mat_SeqAIJ                     *c;
672076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
673076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB;
674a3f881fbSStefano Zampini 
675a3f881fbSStefano Zampini   PetscFunctionBegin;
676a3f881fbSStefano Zampini   MatCheckProduct(C,1);
6772c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
678076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
679076ba34aSJunchao Zhang 
680076ba34aSJunchao Zhang   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
681076ba34aSJunchao Zhang     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
682076ba34aSJunchao Zhang     PetscFunctionReturn(0);
683076ba34aSJunchao Zhang   }
684076ba34aSJunchao Zhang 
685076ba34aSJunchao Zhang   switch (product->type) {
686076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
687076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
688076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
689076ba34aSJunchao Zhang     default:
69098921bdaSJacob Faibussowitsch       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
691076ba34aSJunchao Zhang   }
692076ba34aSJunchao Zhang 
693a3f881fbSStefano Zampini   A     = product->A;
694a3f881fbSStefano Zampini   B     = product->B;
695a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
696a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
697a3f881fbSStefano Zampini   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
698a3f881fbSStefano Zampini   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
699a3f881fbSStefano Zampini   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
700076ba34aSJunchao Zhang 
7012c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
702076ba34aSJunchao Zhang 
703076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
704076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
705076ba34aSJunchao Zhang 
706076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
707076ba34aSJunchao Zhang   if (transA) {
708076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
709076ba34aSJunchao Zhang     transA = false;
710a3f881fbSStefano Zampini   }
711a3f881fbSStefano Zampini 
712076ba34aSJunchao Zhang   if (transB) {
713076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
714076ba34aSJunchao Zhang     transB = false;
715076ba34aSJunchao Zhang   }
716eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
717076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
718076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
719eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
720076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
721a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
722a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
7237d3de750SJacob Faibussowitsch   ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
724a3f881fbSStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
7257d3de750SJacob Faibussowitsch   ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
726a3f881fbSStefano Zampini   c->reallocs         = 0;
727076ba34aSJunchao Zhang   C->info.mallocs     = 0;
728a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
729a3f881fbSStefano Zampini   C->assembled        = C->was_assembled = PETSC_TRUE;
730a3f881fbSStefano Zampini   C->num_ass++;
731a3f881fbSStefano Zampini   PetscFunctionReturn(0);
732a3f881fbSStefano Zampini }
733a3f881fbSStefano Zampini 
734a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
735a3f881fbSStefano Zampini {
736a3f881fbSStefano Zampini   PetscErrorCode                 ierr;
737076ba34aSJunchao Zhang   Mat_Product                    *product = C->product;
738076ba34aSJunchao Zhang   MatProductType                 ptype;
739076ba34aSJunchao Zhang   Mat                            A,B;
740076ba34aSJunchao Zhang   bool                           transA,transB;
741076ba34aSJunchao Zhang   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
742076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
743076ba34aSJunchao Zhang   MPI_Comm                       comm;
744076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
745a3f881fbSStefano Zampini 
746a3f881fbSStefano Zampini   PetscFunctionBegin;
747a3f881fbSStefano Zampini   MatCheckProduct(C,1);
748076ba34aSJunchao Zhang   ierr = PetscObjectGetComm((PetscObject)C,&comm);
7492c71b3e2SJacob Faibussowitsch   PetscCheckFalse(product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
750a3f881fbSStefano Zampini   A       = product->A;
751a3f881fbSStefano Zampini   B       = product->B;
752a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
753a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
754a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
755a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
756076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
757076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
758076ba34aSJunchao Zhang 
759a3f881fbSStefano Zampini   ptype   = product->type;
760a3f881fbSStefano Zampini   switch (ptype) {
761076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
762076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
763076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
764a3f881fbSStefano Zampini     default:
76598921bdaSJacob Faibussowitsch       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
766a3f881fbSStefano Zampini   }
767a3f881fbSStefano Zampini 
768076ba34aSJunchao Zhang   product->data = pdata = new MatProductData_SeqAIJKokkos();
769076ba34aSJunchao Zhang   pdata->kh.set_team_work_size(16);
770076ba34aSJunchao Zhang   pdata->kh.set_dynamic_scheduling(true);
771076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
772a3f881fbSStefano Zampini 
773076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
774076ba34aSJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
7756ffb9508SJunchao Zhang   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
7766ffb9508SJunchao Zhang      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
7776ffb9508SJunchao Zhang    */
778076ba34aSJunchao Zhang   pdata->kh.create_spgemm_handle(spgemm_alg);
779076ba34aSJunchao Zhang 
780eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
781076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
782076ba34aSJunchao Zhang   if (transA) {
783076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
784076ba34aSJunchao Zhang     transA = false;
785076ba34aSJunchao Zhang   }
786076ba34aSJunchao Zhang 
787076ba34aSJunchao Zhang   if (transB) {
788076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
789076ba34aSJunchao Zhang     transB = false;
790076ba34aSJunchao Zhang   }
791076ba34aSJunchao Zhang 
792076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
793076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
794076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
795076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
796076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
797076ba34aSJunchao Zhang   */
798076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
799076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
800eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
801076ba34aSJunchao Zhang 
802076ba34aSJunchao Zhang   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
803076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
804076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
805a3f881fbSStefano Zampini   PetscFunctionReturn(0);
806a3f881fbSStefano Zampini }
807a3f881fbSStefano Zampini 
808a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
809076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
810a3f881fbSStefano Zampini {
811a3f881fbSStefano Zampini   PetscErrorCode ierr;
812076ba34aSJunchao Zhang   Mat_Product    *product = mat->product;
813a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
814a3f881fbSStefano Zampini 
815a3f881fbSStefano Zampini   PetscFunctionBegin;
816a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
817a3f881fbSStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
818a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
819a3f881fbSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
820a3f881fbSStefano Zampini   }
821a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
822a3f881fbSStefano Zampini     switch (product->type) {
823a3f881fbSStefano Zampini     case MATPRODUCT_AB:
824a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
825a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
826a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
827a3f881fbSStefano Zampini       break;
828a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
829a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
830a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
831a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
832a3f881fbSStefano Zampini       break;
833a3f881fbSStefano Zampini     default:
834a3f881fbSStefano Zampini       break;
835a3f881fbSStefano Zampini     }
836a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
837a3f881fbSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
838a3f881fbSStefano Zampini   }
839a3f881fbSStefano Zampini   PetscFunctionReturn(0);
840a3f881fbSStefano Zampini }
841a587d139SMark 
842f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
843f0cf5187SStefano Zampini {
844f0cf5187SStefano Zampini   PetscErrorCode   ierr;
845f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
846f0cf5187SStefano Zampini 
847f0cf5187SStefano Zampini   PetscFunctionBegin;
848eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
849f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
850f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
851076ba34aSJunchao Zhang   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
852076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
853076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
854eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
855f0cf5187SStefano Zampini   PetscFunctionReturn(0);
856f0cf5187SStefano Zampini }
857f0cf5187SStefano Zampini 
858a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
859a587d139SMark {
860a587d139SMark   PetscErrorCode   ierr;
861076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
862a587d139SMark 
863a587d139SMark   PetscFunctionBegin;
864076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
8652328674fSJunchao Zhang   if (aijkok) { /* Only zero the device if data is already there */
866076ba34aSJunchao Zhang     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
867076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
8682328674fSJunchao Zhang   } else { /* Might be preallocated but not assembled */
8692328674fSJunchao Zhang     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
8702328674fSJunchao Zhang   }
871a587d139SMark   PetscFunctionReturn(0);
872a587d139SMark }
873a587d139SMark 
874db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
87542550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
876db78de30SJunchao Zhang {
877db78de30SJunchao Zhang   PetscErrorCode     ierr;
878db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
879db78de30SJunchao Zhang 
880db78de30SJunchao Zhang   PetscFunctionBegin;
881db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
882db78de30SJunchao Zhang   PetscValidPointer(kv,2);
883db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
884db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
885db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
886076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
887db78de30SJunchao Zhang   PetscFunctionReturn(0);
888db78de30SJunchao Zhang }
889db78de30SJunchao Zhang 
89042550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
891db78de30SJunchao Zhang {
892db78de30SJunchao Zhang   PetscFunctionBegin;
893db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
894db78de30SJunchao Zhang   PetscValidPointer(kv,2);
895db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
896db78de30SJunchao Zhang   PetscFunctionReturn(0);
897db78de30SJunchao Zhang }
898db78de30SJunchao Zhang 
89942550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
900db78de30SJunchao Zhang {
901db78de30SJunchao Zhang   PetscErrorCode     ierr;
902db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
903db78de30SJunchao Zhang 
904db78de30SJunchao Zhang   PetscFunctionBegin;
905db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
906db78de30SJunchao Zhang   PetscValidPointer(kv,2);
907db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
908db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
909db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
910076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
911db78de30SJunchao Zhang   PetscFunctionReturn(0);
912db78de30SJunchao Zhang }
913db78de30SJunchao Zhang 
91442550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
915db78de30SJunchao Zhang {
916db78de30SJunchao Zhang   PetscErrorCode     ierr;
917db78de30SJunchao Zhang 
918db78de30SJunchao Zhang   PetscFunctionBegin;
919db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
920db78de30SJunchao Zhang   PetscValidPointer(kv,2);
921db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
922076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
923db78de30SJunchao Zhang   PetscFunctionReturn(0);
924db78de30SJunchao Zhang }
925db78de30SJunchao Zhang 
92642550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
927db78de30SJunchao Zhang {
928db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
929db78de30SJunchao Zhang 
930db78de30SJunchao Zhang   PetscFunctionBegin;
931db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
932db78de30SJunchao Zhang   PetscValidPointer(kv,2);
933db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
934db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
935076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
936db78de30SJunchao Zhang   PetscFunctionReturn(0);
937db78de30SJunchao Zhang }
938db78de30SJunchao Zhang 
93942550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
940db78de30SJunchao Zhang {
941db78de30SJunchao Zhang   PetscErrorCode     ierr;
942db78de30SJunchao Zhang 
943db78de30SJunchao Zhang   PetscFunctionBegin;
944db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
945db78de30SJunchao Zhang   PetscValidPointer(kv,2);
946db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
947076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
948db78de30SJunchao Zhang   PetscFunctionReturn(0);
949db78de30SJunchao Zhang }
950db78de30SJunchao Zhang 
951c17cf699SJunchao Zhang /* Computes Y += alpha X */
952c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
953a587d139SMark {
954a587d139SMark   PetscErrorCode             ierr;
955a587d139SMark   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
956c17cf699SJunchao Zhang   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
957c17cf699SJunchao Zhang   ConstMatScalarKokkosView   Xa;
958c17cf699SJunchao Zhang   MatScalarKokkosView        Ya;
959a587d139SMark 
960a587d139SMark   PetscFunctionBegin;
961c17cf699SJunchao Zhang   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
962c17cf699SJunchao Zhang   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
963c17cf699SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
964c17cf699SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
965eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
966db78de30SJunchao Zhang 
967c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
968c17cf699SJunchao Zhang     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
969a587d139SMark     PetscBool e;
970a587d139SMark     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
971a587d139SMark     if (e) {
972a587d139SMark       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
973c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
974a587d139SMark     }
975a587d139SMark   }
976db78de30SJunchao Zhang 
977c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
978c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
979c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
980c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
981c17cf699SJunchao Zhang   */
982c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
983c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
984c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
985c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
986c17cf699SJunchao Zhang 
987c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
988c17cf699SJunchao Zhang     KokkosBlas::axpy(alpha,Xa,Ya);
989c17cf699SJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(Y);
990c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
991c17cf699SJunchao Zhang     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
992c17cf699SJunchao Zhang     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
993c17cf699SJunchao Zhang 
994c17cf699SJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
995c17cf699SJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
996c17cf699SJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
997c17cf699SJunchao Zhang         PetscInt p,q = Yi(i);
998c17cf699SJunchao Zhang         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
999c17cf699SJunchao Zhang           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1000c17cf699SJunchao Zhang           if (Xj(p) == Yj(q)) { /* Found it */
1001c17cf699SJunchao Zhang             Ya(q) += alpha * Xa(p);
1002c17cf699SJunchao Zhang             q++;
1003a587d139SMark           } else {
1004c17cf699SJunchao Zhang             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1005c17cf699SJunchao Zhang                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1006c17cf699SJunchao Zhang             */
1007c17cf699SJunchao Zhang             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1008a587d139SMark           }
1009c17cf699SJunchao Zhang         }
1010c17cf699SJunchao Zhang       });
1011c17cf699SJunchao Zhang     });
1012c17cf699SJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(Y);
1013c17cf699SJunchao Zhang   } else { /* different nonzero patterns */
1014c17cf699SJunchao Zhang     Mat             Z;
1015c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
1016c17cf699SJunchao Zhang     KernelHandle    kh;
1017c17cf699SJunchao Zhang     kh.create_spadd_handle(false);
1018c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1019c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1020c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
1021c17cf699SJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1022c17cf699SJunchao Zhang     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1023c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1024c17cf699SJunchao Zhang   }
1025eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1026eeadb341SJunchao Zhang   ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */
1027a587d139SMark   PetscFunctionReturn(0);
1028a587d139SMark }
1029a587d139SMark 
1030394ed5ebSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
103142550becSJunchao Zhang {
103242550becSJunchao Zhang   PetscErrorCode            ierr;
1033394ed5ebSJunchao Zhang   Mat                       newmat;
103442550becSJunchao Zhang   Mat_SeqAIJKokkos          *akok;
103542550becSJunchao Zhang   Mat_SeqAIJ                *aseq;
103642550becSJunchao Zhang 
103742550becSJunchao Zhang   PetscFunctionBegin;
1038394ed5ebSJunchao Zhang   ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr);
1039394ed5ebSJunchao Zhang   ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr);
1040394ed5ebSJunchao Zhang   ierr = MatSetType(newmat,MATSEQAIJ);CHKERRQ(ierr);
1041394ed5ebSJunchao Zhang   ierr = MatSetPreallocationCOO_SeqAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
104242550becSJunchao Zhang   ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
104342550becSJunchao Zhang   ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
104442550becSJunchao Zhang   ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
1045394ed5ebSJunchao Zhang   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
104642550becSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1047394ed5ebSJunchao Zhang   akok->SetUpCOO(aseq);
104842550becSJunchao Zhang   PetscFunctionReturn(0);
104942550becSJunchao Zhang }
105042550becSJunchao Zhang 
105142550becSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
105242550becSJunchao Zhang {
105342550becSJunchao Zhang   PetscErrorCode              ierr;
105442550becSJunchao Zhang   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
105542550becSJunchao Zhang   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1056394ed5ebSJunchao Zhang   PetscCount                  Annz = aseq->nz;
1057394ed5ebSJunchao Zhang   const PetscCountKokkosView& jmap = akok->jmap_d;
1058394ed5ebSJunchao Zhang   const PetscCountKokkosView& perm = akok->perm_d;
105942550becSJunchao Zhang   MatScalarKokkosView         Aa;
106042550becSJunchao Zhang   ConstMatScalarKokkosView    kv;
106142550becSJunchao Zhang   PetscMemType                memtype;
106242550becSJunchao Zhang 
106342550becSJunchao Zhang   PetscFunctionBegin;
1064394ed5ebSJunchao Zhang   PetscAssert(A->assembled,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Expected matrix to be already assembled in MatSetPreallocationCOO()");
106542550becSJunchao Zhang   ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
106642550becSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1067394ed5ebSJunchao Zhang     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
106842550becSJunchao Zhang   } else {
1069394ed5ebSJunchao Zhang     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
107042550becSJunchao Zhang   }
107142550becSJunchao Zhang 
1072394ed5ebSJunchao Zhang   if (imode == INSERT_VALUES) {ierr = MatSeqAIJGetKokkosViewWrite(A,&Aa);CHKERRQ(ierr);} /* write matrix values */
1073394ed5ebSJunchao Zhang   else {ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr);} /* read & write matrix values */
107442550becSJunchao Zhang 
1075394ed5ebSJunchao Zhang   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {
1076b6c38306SJunchao Zhang     PetscScalar sum = 0.0;
1077b6c38306SJunchao Zhang     for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k));
1078b6c38306SJunchao Zhang     Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum;
107942550becSJunchao Zhang   });
1080394ed5ebSJunchao Zhang 
1081394ed5ebSJunchao Zhang   if (imode == INSERT_VALUES) {ierr = MatSeqAIJRestoreKokkosViewWrite(A,&Aa);CHKERRQ(ierr);}
1082394ed5ebSJunchao Zhang   else {ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);}
108342550becSJunchao Zhang   PetscFunctionReturn(0);
108442550becSJunchao Zhang }
108542550becSJunchao Zhang 
108686a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
10878f7e8f9dSMark Adams {
10888f7e8f9dSMark Adams   PetscErrorCode ierr;
10898f7e8f9dSMark Adams 
10908f7e8f9dSMark Adams   PetscFunctionBegin;
10918f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
10928f7e8f9dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
10938f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
10948f7e8f9dSMark Adams   PetscFunctionReturn(0);
10958f7e8f9dSMark Adams }
10968f7e8f9dSMark Adams 
10978c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
10988c3ff71bSJunchao Zhang {
109942550becSJunchao Zhang   PetscErrorCode     ierr;
1100076ba34aSJunchao Zhang   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1101076ba34aSJunchao Zhang 
11028c3ff71bSJunchao Zhang   PetscFunctionBegin;
1103076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
11046f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
11056f3d89d0SStefano Zampini 
11068c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
11078c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
11088c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1109a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1110f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1111a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1112076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
11138c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
11148c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
11158c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
11168c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
11178c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
11188c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1119076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
11200ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1121152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1122076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1123076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1124076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1125076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1126076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1127076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
112842550becSJunchao Zhang 
112942550becSJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr);
113042550becSJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr);
1131076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1132076ba34aSJunchao Zhang }
1133076ba34aSJunchao Zhang 
1134076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1135076ba34aSJunchao Zhang {
1136076ba34aSJunchao Zhang   PetscErrorCode ierr;
1137076ba34aSJunchao Zhang   Mat_SeqAIJ     *aseq;
1138076ba34aSJunchao Zhang   PetscInt       i,m,n;
1139076ba34aSJunchao Zhang 
1140076ba34aSJunchao Zhang   PetscFunctionBegin;
11412c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1142076ba34aSJunchao Zhang 
1143076ba34aSJunchao Zhang   m    = akok->nrows();
1144076ba34aSJunchao Zhang   n    = akok->ncols();
1145076ba34aSJunchao Zhang   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1146076ba34aSJunchao Zhang   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1147076ba34aSJunchao Zhang 
1148076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
1149076ba34aSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1150076ba34aSJunchao Zhang   aseq = (Mat_SeqAIJ*)(A)->data;
1151076ba34aSJunchao Zhang 
1152076ba34aSJunchao Zhang   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1153076ba34aSJunchao Zhang   akok->j_dual.sync_host();
1154076ba34aSJunchao Zhang 
1155076ba34aSJunchao Zhang   aseq->i            = akok->i_host_data();
1156076ba34aSJunchao Zhang   aseq->j            = akok->j_host_data();
1157076ba34aSJunchao Zhang   aseq->a            = akok->a_host_data();
1158076ba34aSJunchao Zhang   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1159076ba34aSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE;
1160076ba34aSJunchao Zhang   aseq->free_a       = PETSC_FALSE;
1161076ba34aSJunchao Zhang   aseq->free_ij      = PETSC_FALSE;
1162076ba34aSJunchao Zhang   aseq->nz           = akok->nnz();
1163076ba34aSJunchao Zhang   aseq->maxnz        = aseq->nz;
1164076ba34aSJunchao Zhang 
1165076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1166076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1167076ba34aSJunchao Zhang   for (i=0; i<m; i++) {
1168076ba34aSJunchao Zhang     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1169076ba34aSJunchao Zhang   }
1170076ba34aSJunchao Zhang 
1171076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1172076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1173ff751488SJunchao Zhang   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1174076ba34aSJunchao Zhang   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1175076ba34aSJunchao Zhang   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1176076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1177076ba34aSJunchao Zhang }
1178076ba34aSJunchao Zhang 
1179076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1180076ba34aSJunchao Zhang 
1181076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1182076ba34aSJunchao Zhang  */
1183076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1184076ba34aSJunchao Zhang {
1185076ba34aSJunchao Zhang   PetscErrorCode ierr;
1186076ba34aSJunchao Zhang 
1187076ba34aSJunchao Zhang   PetscFunctionBegin;
1188076ba34aSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1189076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
11908c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
11918c3ff71bSJunchao Zhang }
11928c3ff71bSJunchao Zhang 
11938c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
1194152b3e56SJunchao Zhang /*@C
11958c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
11968c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
11978c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
11988c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
11998c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
12008c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
12018c3ff71bSJunchao Zhang 
12028c3ff71bSJunchao Zhang    Collective
12038c3ff71bSJunchao Zhang 
12048c3ff71bSJunchao Zhang    Input Parameters:
12058c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
12068c3ff71bSJunchao Zhang .  m - number of rows
12078c3ff71bSJunchao Zhang .  n - number of columns
12088c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
12098c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
12108c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
12118c3ff71bSJunchao Zhang 
12128c3ff71bSJunchao Zhang    Output Parameter:
12138c3ff71bSJunchao Zhang .  A - the matrix
12148c3ff71bSJunchao Zhang 
12158c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
12168c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
12178c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
12188c3ff71bSJunchao Zhang 
12198c3ff71bSJunchao Zhang    Notes:
12208c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
12218c3ff71bSJunchao Zhang 
12228c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
12238c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
12248c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
12258c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
12268c3ff71bSJunchao Zhang 
12278c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
12288c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
12298c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
12308c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
12318c3ff71bSJunchao Zhang 
12328c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
12338c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
12348c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
12358c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
12368c3ff71bSJunchao Zhang 
12378c3ff71bSJunchao Zhang    Level: intermediate
12388c3ff71bSJunchao Zhang 
1239076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
12408c3ff71bSJunchao Zhang @*/
12418c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
12428c3ff71bSJunchao Zhang {
12438c3ff71bSJunchao Zhang   PetscErrorCode ierr;
12448c3ff71bSJunchao Zhang 
12458c3ff71bSJunchao Zhang   PetscFunctionBegin;
12468c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
12478c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
12488c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
12498c3ff71bSJunchao Zhang   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
12508c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
12518c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
12528c3ff71bSJunchao Zhang }
1253930e68a5SMark Adams 
12548f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
12558f7e8f9dSMark Adams //
125646804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used).
12578f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
12588f7e8f9dSMark Adams //
12598f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1260930e68a5SMark Adams {
12618f7e8f9dSMark Adams   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
12628f7e8f9dSMark Adams   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
12638f7e8f9dSMark Adams   IS                 isrow = b->row,isicol = b->icol;
1264930e68a5SMark Adams   PetscErrorCode     ierr;
12658f7e8f9dSMark Adams   const PetscInt     *r_h,*ic_h;
1266300d22a6SJunchao Zhang   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1267076ba34aSJunchao Zhang   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1268076ba34aSJunchao Zhang   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
12698f7e8f9dSMark Adams   PetscBool          row_identity,col_identity;
127046804e07SMark Adams   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1271930e68a5SMark Adams 
1272930e68a5SMark Adams   PetscFunctionBegin;
12732c71b3e2SJacob Faibussowitsch   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
12748f7e8f9dSMark Adams   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
12752c71b3e2SJacob Faibussowitsch   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
12768f7e8f9dSMark Adams   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
12778f7e8f9dSMark Adams   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
12788f7e8f9dSMark Adams   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
12798f7e8f9dSMark Adams   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
12808f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
12818f7e8f9dSMark Adams   {
12828f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
12838f7e8f9dSMark Adams     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
12848f7e8f9dSMark Adams     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
12858f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
12868f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
12878f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
12888f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
12898f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
12908f7e8f9dSMark Adams     size_t flops_h = 0.0;
12918f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
12928f7e8f9dSMark Adams     Kokkos::View<size_t> d_flops_k ("flops");
12938f7e8f9dSMark Adams     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
12948f7e8f9dSMark Adams     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
12958f7e8f9dSMark Adams     Kokkos::deep_copy (d_flops_k, h_flops_k);
12968f7e8f9dSMark Adams     Kokkos::deep_copy (d_r_k, h_r_k);
12978f7e8f9dSMark Adams     Kokkos::deep_copy (d_ic_k, h_ic_k);
12988f7e8f9dSMark Adams     // Fill A --> fact
12998f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1300042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
13018f7e8f9dSMark Adams         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
13028f7e8f9dSMark Adams         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
13038f7e8f9dSMark Adams         // zero rows of B
13048f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
13058f7e8f9dSMark Adams             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
13068f7e8f9dSMark Adams             PetscScalar *baL = ba_d + bi_d[rowb];
13078f7e8f9dSMark Adams             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
13088f7e8f9dSMark Adams             /* zero (unfactored row) */
13098f7e8f9dSMark Adams             for (int j=0;j<nzbL;j++) baL[j] = 0;
13108f7e8f9dSMark Adams             for (int j=0;j<nzbU;j++) baU[j] = 0;
13118f7e8f9dSMark Adams           });
13128f7e8f9dSMark Adams         // copy A into B
13138f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
13148f7e8f9dSMark Adams             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
13158f7e8f9dSMark Adams             const PetscScalar *av    = aa_d + ai_d[rowa];
13168f7e8f9dSMark Adams             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
13178f7e8f9dSMark Adams             /* load in initial (unfactored row) */
13188f7e8f9dSMark Adams             for (int j=0;j<nza;j++) {
13198f7e8f9dSMark Adams               PetscInt    colb = ic[ajtmp[j]];
13208f7e8f9dSMark Adams               PetscScalar vala = av[j];
13218f7e8f9dSMark Adams               if (colb == rowb) {
13228f7e8f9dSMark Adams                 *(ba_d + bdiag_d[rowb]) = vala;
13238f7e8f9dSMark Adams               } else {
13248f7e8f9dSMark Adams                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
13258f7e8f9dSMark Adams                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
13268f7e8f9dSMark Adams                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
13278f7e8f9dSMark Adams                 for (int j=0; j<nz ; j++) {
13288f7e8f9dSMark Adams                   if (pbj[j] == colb) {
13298f7e8f9dSMark Adams                     pba[j] = vala;
13308f7e8f9dSMark Adams                     set++;
13318f7e8f9dSMark Adams                     break;
13328f7e8f9dSMark Adams                   }
13338f7e8f9dSMark Adams                 }
13348f1da0b2SJunchao Zhang                #if !defined(PETSC_HAVE_SYCL)
13358f7e8f9dSMark Adams                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
13368f1da0b2SJunchao Zhang                #endif
13378f7e8f9dSMark Adams               }
13388f7e8f9dSMark Adams             }
13398f7e8f9dSMark Adams           });
13408f7e8f9dSMark Adams       });
13418f7e8f9dSMark Adams     Kokkos::fence();
1342930e68a5SMark Adams 
13438f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
13448f7e8f9dSMark Adams         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
13458f7e8f9dSMark Adams         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
13468f7e8f9dSMark Adams         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1347042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
13488f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc;
13498f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
13508f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
13518f7e8f9dSMark Adams         for (int ii=start; ii<end-1; ii++) {
13528f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
13538f7e8f9dSMark Adams           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
13548f7e8f9dSMark Adams           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
13558f7e8f9dSMark Adams           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
13568f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
13578f7e8f9dSMark Adams               PetscInt kIdx = j*Ni + field_block_idx;
13588f7e8f9dSMark Adams               if (kIdx >= nzUi) /* void */ ;
13598f7e8f9dSMark Adams               else {
13608f7e8f9dSMark Adams                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
13618f7e8f9dSMark Adams                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
13628f7e8f9dSMark Adams                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
13638f7e8f9dSMark Adams                 size_t         st_idx;
13648f7e8f9dSMark Adams                 // find and do L(k,i) = A(:k,i) / A(i,i)
13658f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
13668f7e8f9dSMark Adams                 // get column, there has got to be a better way
13678f7e8f9dSMark Adams                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
13688f7e8f9dSMark Adams                     if (pjL[j] == ii) {
13698f7e8f9dSMark Adams                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
13708f7e8f9dSMark Adams                       idx = j; // output
13718f7e8f9dSMark Adams                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
13728f7e8f9dSMark Adams                     }
13738f7e8f9dSMark Adams                 }, st_idx);
13748f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
13758f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
137699551766SMark Adams                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
137799551766SMark Adams #endif
137899551766SMark Adams                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
13798f7e8f9dSMark Adams                 // U(i+1,:end)
13808f7e8f9dSMark Adams                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
13818f7e8f9dSMark Adams                       PetscScalar Uij = baUi[uiIdx];
13828f7e8f9dSMark Adams                       PetscInt    col = bjUi[uiIdx];
13838f7e8f9dSMark Adams                       if (col==myk) {
13848f7e8f9dSMark Adams                         // A_kk = A_kk - L_ki * U_ij(k)
13858f7e8f9dSMark Adams                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
13868f7e8f9dSMark Adams                         *Akkv = *Akkv - L_ki() * Uij; // UiK
13878f7e8f9dSMark Adams                       } else {
13888f7e8f9dSMark Adams                         PetscScalar    *start, *end, *pAkjv=NULL;
13898f7e8f9dSMark Adams                         PetscInt       high, low;
13908f7e8f9dSMark Adams                         const PetscInt *startj;
13918f7e8f9dSMark Adams                         if (col<myk) { // L
13928f7e8f9dSMark Adams                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
13938f7e8f9dSMark Adams                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
13948f7e8f9dSMark Adams                           start = pLki+1; // start at pLki+1, A22(myk,1)
13958f7e8f9dSMark Adams                           startj= bj_d + bi_d[myk] + idx;
13968f7e8f9dSMark Adams                           end   = ba_d + bi_d[myk+1];
13978f7e8f9dSMark Adams                         } else {
13988f7e8f9dSMark Adams                           PetscInt idx = bdiag_d[myk+1]+1;
13998f7e8f9dSMark Adams                           start = ba_d + idx;
14008f7e8f9dSMark Adams                           startj= bj_d + idx;
14018f7e8f9dSMark Adams                           end   = ba_d + bdiag_d[myk];
14028f7e8f9dSMark Adams                         }
14038f7e8f9dSMark Adams                         // search for 'col', use bisection search - TODO
14048f7e8f9dSMark Adams                         low  = 0;
14058f7e8f9dSMark Adams                         high = (PetscInt)(end-start);
14068f7e8f9dSMark Adams                         while (high-low > 5) {
14078f7e8f9dSMark Adams                           int t = (low+high)/2;
14088f7e8f9dSMark Adams                           if (startj[t] > col) high = t;
14098f7e8f9dSMark Adams                           else                 low  = t;
14108f7e8f9dSMark Adams                         }
14118f7e8f9dSMark Adams                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
14128f7e8f9dSMark Adams                           if (startj[pAkjv-start] == col) break;
14138f7e8f9dSMark Adams                         }
14148f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
141599551766SMark Adams                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
141699551766SMark Adams #endif
14178f7e8f9dSMark Adams                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
14188f7e8f9dSMark Adams                       }
14198f7e8f9dSMark Adams                     });
14208f7e8f9dSMark Adams               }
14218f7e8f9dSMark Adams             });
14228f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
14238f7e8f9dSMark Adams           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
14248f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
14258f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
14268f7e8f9dSMark Adams       });
14278f7e8f9dSMark Adams     Kokkos::fence();
14288f7e8f9dSMark Adams     Kokkos::deep_copy (h_flops_k, d_flops_k);
14298f7e8f9dSMark Adams     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
14308f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
14318f7e8f9dSMark Adams         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
14328f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
14338f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
14348f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
14358f7e8f9dSMark Adams             int i = start + outer_index*Ni + lg_rank%Ni;
14368f7e8f9dSMark Adams             if (i < end) {
14378f7e8f9dSMark Adams               PetscScalar *pv = ba_d + bdiag_d[i];
14388f7e8f9dSMark Adams               *pv = 1.0/(*pv);
14398f7e8f9dSMark Adams             }
14408f7e8f9dSMark Adams           });
14418f7e8f9dSMark Adams       });
14428f7e8f9dSMark Adams   }
14438f7e8f9dSMark Adams   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
14448f7e8f9dSMark Adams   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
14458f7e8f9dSMark Adams   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
14468f7e8f9dSMark Adams 
14478f7e8f9dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
14488f7e8f9dSMark Adams   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
14498f7e8f9dSMark Adams   if (b->inode.size) {
14508f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
14518f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
14528f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
14538f7e8f9dSMark Adams   } else {
14548f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
14558f7e8f9dSMark Adams   }
14568f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
14578f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
14588f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
14598f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
14608f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
14618f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
14628f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
14638f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
14648f7e8f9dSMark Adams 
1465930e68a5SMark Adams   PetscFunctionReturn(0);
1466930e68a5SMark Adams }
1467930e68a5SMark Adams 
146886a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1469930e68a5SMark Adams {
1470930e68a5SMark Adams   PetscErrorCode   ierr;
1471930e68a5SMark Adams 
1472930e68a5SMark Adams   PetscFunctionBegin;
1473930e68a5SMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
147486a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
147586a27549SJunchao Zhang   PetscFunctionReturn(0);
147686a27549SJunchao Zhang }
147786a27549SJunchao Zhang 
147886a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
147986a27549SJunchao Zhang {
148086a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
148186a27549SJunchao Zhang 
148286a27549SJunchao Zhang   PetscFunctionBegin;
148386a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
148486a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
148586a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
148686a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
148786a27549SJunchao Zhang   }
148886a27549SJunchao Zhang   PetscFunctionReturn(0);
148986a27549SJunchao Zhang }
149086a27549SJunchao Zhang 
149186a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
149286a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
149386a27549SJunchao Zhang {
149486a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1495076ba34aSJunchao Zhang   MatColIdxType             n = A->rmap->n;
149686a27549SJunchao Zhang 
149786a27549SJunchao Zhang   PetscFunctionBegin;
149886a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
149986a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
1500076ba34aSJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
150186a27549SJunchao Zhang     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1502076ba34aSJunchao Zhang     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1503076ba34aSJunchao Zhang     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
150486a27549SJunchao Zhang 
150586a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1506076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1507076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1508076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
150986a27549SJunchao Zhang         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
151086a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
151186a27549SJunchao Zhang 
151286a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
151386a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
151486a27549SJunchao Zhang     */
1515076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1516076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
151786a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
151886a27549SJunchao Zhang 
151986a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
152086a27549SJunchao Zhang 
152186a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
1522076ba34aSJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
152386a27549SJunchao Zhang     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1524076ba34aSJunchao Zhang     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1525076ba34aSJunchao Zhang     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
152686a27549SJunchao Zhang 
152786a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1528076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1529076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1530076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
153186a27549SJunchao Zhang         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
153286a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
153386a27549SJunchao Zhang 
153486a27549SJunchao Zhang     /* Sort indices. See comments above */
1535076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1536076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
153786a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
153886a27549SJunchao Zhang 
153986a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
154086a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
154186a27549SJunchao Zhang   }
154286a27549SJunchao Zhang   PetscFunctionReturn(0);
154386a27549SJunchao Zhang }
154486a27549SJunchao Zhang 
154586a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
154686a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
154786a27549SJunchao Zhang {
154886a27549SJunchao Zhang   PetscErrorCode                 ierr;
154986a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
155086a27549SJunchao Zhang   PetscScalarKokkosView          xv;
155186a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
155286a27549SJunchao Zhang 
155386a27549SJunchao Zhang   PetscFunctionBegin;
1554eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
155586a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
155686a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1557*ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr);
155886a27549SJunchao Zhang   /* Solve L tmpv = b */
15593f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
156086a27549SJunchao Zhang   /* Solve Ux = tmpv */
15613f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
156286a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1563*ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr);
1564eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
156586a27549SJunchao Zhang   PetscFunctionReturn(0);
156686a27549SJunchao Zhang }
156786a27549SJunchao Zhang 
1568076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
156986a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
157086a27549SJunchao Zhang {
157186a27549SJunchao Zhang   PetscErrorCode                 ierr;
157286a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
157386a27549SJunchao Zhang   PetscScalarKokkosView          xv;
157486a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
157586a27549SJunchao Zhang 
157686a27549SJunchao Zhang   PetscFunctionBegin;
1577eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
157886a27549SJunchao Zhang   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
157986a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1580*ad7ac7b2SJunchao Zhang   ierr = VecGetKokkosViewWrite(x,&xv);CHKERRQ(ierr);
158186a27549SJunchao Zhang   /* Solve U^T tmpv = b */
158286a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
158386a27549SJunchao Zhang 
158486a27549SJunchao Zhang   /* Solve L^T x = tmpv */
158586a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
158686a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1587*ad7ac7b2SJunchao Zhang   ierr = VecRestoreKokkosViewWrite(x,&xv);CHKERRQ(ierr);
1588eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
158986a27549SJunchao Zhang   PetscFunctionReturn(0);
159086a27549SJunchao Zhang }
159186a27549SJunchao Zhang 
159286a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
159386a27549SJunchao Zhang {
159486a27549SJunchao Zhang   PetscErrorCode                 ierr;
159586a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
159686a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
159786a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
159886a27549SJunchao Zhang 
159986a27549SJunchao Zhang   PetscFunctionBegin;
1600eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
160186a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1602076ba34aSJunchao Zhang 
1603076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1604076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1605076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1606076ba34aSJunchao Zhang 
1607076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
160886a27549SJunchao Zhang 
160986a27549SJunchao Zhang   B->assembled                       = PETSC_TRUE;
161086a27549SJunchao Zhang   B->preallocated                    = PETSC_TRUE;
161186a27549SJunchao Zhang   B->ops->solve                      = MatSolve_SeqAIJKokkos;
161286a27549SJunchao Zhang   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
161386a27549SJunchao Zhang   B->ops->matsolve                   = NULL;
161486a27549SJunchao Zhang   B->ops->matsolvetranspose          = NULL;
161586a27549SJunchao Zhang   B->offloadmask                     = PETSC_OFFLOAD_GPU;
161686a27549SJunchao Zhang 
161786a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
161886a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
161986a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1620eeadb341SJunchao Zhang   /* TODO: log flops, but how to know that? */
1621eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
162286a27549SJunchao Zhang   PetscFunctionReturn(0);
162386a27549SJunchao Zhang }
162486a27549SJunchao Zhang 
162586a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
162686a27549SJunchao Zhang {
162786a27549SJunchao Zhang   PetscErrorCode                 ierr;
162886a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok;
162986a27549SJunchao Zhang   Mat_SeqAIJ                     *b;
163086a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
163186a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
163286a27549SJunchao Zhang   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
163386a27549SJunchao Zhang   PetscInt                       n = A->rmap->n;
163486a27549SJunchao Zhang 
163586a27549SJunchao Zhang   PetscFunctionBegin;
163686a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
163786a27549SJunchao Zhang   /* Rebuild factors */
163886a27549SJunchao Zhang   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
163986a27549SJunchao Zhang   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
164086a27549SJunchao Zhang 
164186a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
164286a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
164386a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
164486a27549SJunchao Zhang 
164586a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
164686a27549SJunchao Zhang 
164786a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
164886a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
164986a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d,n+1);
165086a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
165186a27549SJunchao Zhang 
165286a27549SJunchao Zhang   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1653076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1654076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1655076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
165686a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
165786a27549SJunchao Zhang 
165886a27549SJunchao Zhang   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
165986a27549SJunchao Zhang   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
166086a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
166186a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
166286a27549SJunchao Zhang 
166386a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
166486a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
166586a27549SJunchao Zhang  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
166686a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
166786a27549SJunchao Zhang  #else
166886a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
166986a27549SJunchao Zhang  #endif
167086a27549SJunchao Zhang 
167186a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
167286a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
167386a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
167486a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
167586a27549SJunchao Zhang 
167686a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
167786a27549SJunchao Zhang   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
167886a27549SJunchao Zhang   b     = (Mat_SeqAIJ*)B->data;
167986a27549SJunchao Zhang   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
168086a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
168186a27549SJunchao Zhang   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
168286a27549SJunchao Zhang 
168386a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
168486a27549SJunchao Zhang   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1685930e68a5SMark Adams   PetscFunctionReturn(0);
1686930e68a5SMark Adams }
1687930e68a5SMark Adams 
16888f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
16898f7e8f9dSMark Adams {
16908f7e8f9dSMark Adams   PetscErrorCode   ierr;
16918f7e8f9dSMark Adams   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
16928f7e8f9dSMark Adams   const PetscInt   nrows   = A->rmap->n;
1693930e68a5SMark Adams 
16948f7e8f9dSMark Adams   PetscFunctionBegin;
16958f7e8f9dSMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
16968f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
16978f7e8f9dSMark Adams   // move B data into Kokkos
16988f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
16998f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
17008f7e8f9dSMark Adams   {
17018f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1702300d22a6SJunchao Zhang     if (!baijkok->diag_d.extent(0)) {
17038f7e8f9dSMark Adams       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1704300d22a6SJunchao Zhang       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1705300d22a6SJunchao Zhang       Kokkos::deep_copy (baijkok->diag_d, h_diag);
17068f7e8f9dSMark Adams     }
17078f7e8f9dSMark Adams   }
17088f7e8f9dSMark Adams   PetscFunctionReturn(0);
17098f7e8f9dSMark Adams }
17108f7e8f9dSMark Adams 
171186a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1712930e68a5SMark Adams {
1713930e68a5SMark Adams   PetscFunctionBegin;
1714930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
1715930e68a5SMark Adams   PetscFunctionReturn(0);
1716930e68a5SMark Adams }
1717930e68a5SMark Adams 
17188f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
17198f7e8f9dSMark Adams {
17208f7e8f9dSMark Adams   PetscFunctionBegin;
17218f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
17228f7e8f9dSMark Adams   PetscFunctionReturn(0);
17238f7e8f9dSMark Adams }
17248f7e8f9dSMark Adams 
1725930e68a5SMark Adams /*MC
172686a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
172786a27549SJunchao Zhang   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1728930e68a5SMark Adams 
1729930e68a5SMark Adams   Level: beginner
1730930e68a5SMark Adams 
173186a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1732930e68a5SMark Adams M*/
173386a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1734930e68a5SMark Adams {
1735930e68a5SMark Adams   PetscErrorCode ierr;
1736930e68a5SMark Adams   PetscInt       n = A->rmap->n;
1737930e68a5SMark Adams 
1738930e68a5SMark Adams   PetscFunctionBegin;
1739930e68a5SMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1740930e68a5SMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1741930e68a5SMark Adams   (*B)->factortype = ftype;
17424ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1743930e68a5SMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1744930e68a5SMark Adams 
17458f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
1746930e68a5SMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
174786a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_TRUE;
174886a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
174986a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
175086a27549SJunchao Zhang     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
175186a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
175286a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
175398921bdaSJacob Faibussowitsch   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1754930e68a5SMark Adams 
1755930e68a5SMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
175686a27549SJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1757930e68a5SMark Adams   PetscFunctionReturn(0);
1758930e68a5SMark Adams }
17598f7e8f9dSMark Adams 
17608f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
17618f7e8f9dSMark Adams {
17628f7e8f9dSMark Adams   PetscErrorCode ierr;
17638f7e8f9dSMark Adams   PetscInt       n = A->rmap->n;
17648f7e8f9dSMark Adams 
17658f7e8f9dSMark Adams   PetscFunctionBegin;
17668f7e8f9dSMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
17678f7e8f9dSMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
17688f7e8f9dSMark Adams   (*B)->factortype = ftype;
1769f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
17704ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
17718f7e8f9dSMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
17728f7e8f9dSMark Adams 
17738f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
17748f7e8f9dSMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
17758f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
17768f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
17778f7e8f9dSMark Adams 
17788f7e8f9dSMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
17798f7e8f9dSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
17808f7e8f9dSMark Adams   PetscFunctionReturn(0);
17818f7e8f9dSMark Adams }
178286a27549SJunchao Zhang 
178386a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
178486a27549SJunchao Zhang {
178586a27549SJunchao Zhang   PetscErrorCode ierr;
178686a27549SJunchao Zhang 
178786a27549SJunchao Zhang   PetscFunctionBegin;
178886a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
178986a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
179086a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
179186a27549SJunchao Zhang   PetscFunctionReturn(0);
179286a27549SJunchao Zhang }
179386a27549SJunchao Zhang 
1794076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
1795076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1796076ba34aSJunchao Zhang {
1797076ba34aSJunchao Zhang   PetscErrorCode    ierr;
1798076ba34aSJunchao Zhang   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1799076ba34aSJunchao Zhang   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1800076ba34aSJunchao Zhang   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1801076ba34aSJunchao Zhang   const PetscInt    *i = iv.data();
1802076ba34aSJunchao Zhang   const PetscInt    *j = jv.data();
1803076ba34aSJunchao Zhang   const PetscScalar *a = av.data();
1804076ba34aSJunchao Zhang   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1805076ba34aSJunchao Zhang 
1806076ba34aSJunchao Zhang   PetscFunctionBegin;
1807c0aa6a63SJacob Faibussowitsch   ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1808076ba34aSJunchao Zhang   for (PetscInt k=0; k<m; k++) {
1809c0aa6a63SJacob Faibussowitsch     ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr);
1810076ba34aSJunchao Zhang     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1811c0aa6a63SJacob Faibussowitsch       ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1812076ba34aSJunchao Zhang     }
1813076ba34aSJunchao Zhang     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1814076ba34aSJunchao Zhang   }
1815076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1816076ba34aSJunchao Zhang }
1817