xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision c0aa6a63a7860d309a895cb4aa0f9e11e7859f3a)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2076ba34aSJunchao Zhang #include <petscpkg_version.h>
3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
48c3ff71bSJunchao Zhang #include <petscsystypes.h>
58c3ff71bSJunchao Zhang #include <petscerror.h>
68c3ff71bSJunchao Zhang 
78c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
8f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
9076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp>
108c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
118c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
14076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
15076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
1686a27549SJunchao Zhang 
178c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h>
188c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
198c3ff71bSJunchao Zhang 
208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
218c3ff71bSJunchao Zhang 
22076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
25076ba34aSJunchao Zhang  */
268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
278c3ff71bSJunchao Zhang {
288c3ff71bSJunchao Zhang   PetscErrorCode    ierr;
29076ba34aSJunchao Zhang   Mat_SeqAIJ        *aijseq;
30076ba34aSJunchao Zhang   Mat_SeqAIJKokkos  *aijkok;
318c3ff71bSJunchao Zhang 
328c3ff71bSJunchao Zhang   PetscFunctionBegin;
33076ba34aSJunchao Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
348c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
35076ba34aSJunchao Zhang 
36076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38076ba34aSJunchao Zhang 
39076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
40076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
41076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
42076ba34aSJunchao Zhang   */
43076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
44076ba34aSJunchao Zhang     delete aijkok;
45076ba34aSJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
46076ba34aSJunchao Zhang     A->spptr = aijkok;
47076ba34aSJunchao Zhang   }
48076ba34aSJunchao Zhang 
49a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
50a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
51a587d139SMark   }
528c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
538c3ff71bSJunchao Zhang }
548c3ff71bSJunchao Zhang 
5586a27549SJunchao Zhang /* Sync CSR data to device if not yet */
56076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
578c3ff71bSJunchao Zhang {
588c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
598c3ff71bSJunchao Zhang 
608c3ff71bSJunchao Zhang   PetscFunctionBegin;
6186a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
62076ba34aSJunchao Zhang   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
63076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
64076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
65076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
66076ba34aSJunchao Zhang     aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */
6786a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
688c3ff71bSJunchao Zhang   }
698c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
708c3ff71bSJunchao Zhang }
718c3ff71bSJunchao Zhang 
72076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
73fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
7486a27549SJunchao Zhang {
7586a27549SJunchao Zhang   PetscErrorCode   ierr;
7686a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
7786a27549SJunchao Zhang 
7886a27549SJunchao Zhang   PetscFunctionBegin;
7986a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
8086a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
8186a27549SJunchao Zhang   aijkok->a_dual.modify_device();
8286a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
8386a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
842328674fSJunchao Zhang   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
8586a27549SJunchao Zhang   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
8686a27549SJunchao Zhang   PetscFunctionReturn(0);
8786a27549SJunchao Zhang }
8886a27549SJunchao Zhang 
89f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
90f0cf5187SStefano Zampini {
91f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
92f0cf5187SStefano Zampini 
93f0cf5187SStefano Zampini   PetscFunctionBegin;
94f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9586a27549SJunchao Zhang    /* We do not expect one needs factors on host  */
9686a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
97f0cf5187SStefano Zampini   if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
98076ba34aSJunchao Zhang   aijkok->a_dual.sync_host();
99f0cf5187SStefano Zampini   PetscFunctionReturn(0);
100f0cf5187SStefano Zampini }
101f0cf5187SStefano Zampini 
102f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
103f0cf5187SStefano Zampini {
104076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
105f0cf5187SStefano Zampini 
106f0cf5187SStefano Zampini   PetscFunctionBegin;
107076ba34aSJunchao Zhang   if (aijkok) {
108076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
109076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
110076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
111076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
112076ba34aSJunchao Zhang   }
113076ba34aSJunchao Zhang   PetscFunctionReturn(0);
114076ba34aSJunchao Zhang }
115076ba34aSJunchao Zhang 
116076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
117076ba34aSJunchao Zhang {
118076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
119076ba34aSJunchao Zhang 
120076ba34aSJunchao Zhang   PetscFunctionBegin;
121076ba34aSJunchao Zhang   if (aijkok) aijkok->a_dual.modify_host();
122076ba34aSJunchao Zhang   PetscFunctionReturn(0);
123076ba34aSJunchao Zhang }
124076ba34aSJunchao Zhang 
125076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
126076ba34aSJunchao Zhang {
127076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
128076ba34aSJunchao Zhang 
129076ba34aSJunchao Zhang   PetscFunctionBegin;
1302328674fSJunchao Zhang   if (aijkok) {
131076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
132076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1332328674fSJunchao Zhang   } else {
1342328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1352328674fSJunchao Zhang   }
136076ba34aSJunchao Zhang   PetscFunctionReturn(0);
137076ba34aSJunchao Zhang }
138076ba34aSJunchao Zhang 
139076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
140076ba34aSJunchao Zhang {
141076ba34aSJunchao Zhang   PetscFunctionBegin;
142076ba34aSJunchao Zhang   *array = NULL;
143076ba34aSJunchao Zhang   PetscFunctionReturn(0);
144076ba34aSJunchao Zhang }
145076ba34aSJunchao Zhang 
146076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
147076ba34aSJunchao Zhang {
148076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
149076ba34aSJunchao Zhang 
150076ba34aSJunchao Zhang   PetscFunctionBegin;
1512328674fSJunchao Zhang   if (aijkok) {
152076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1532328674fSJunchao Zhang   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
1542328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1552328674fSJunchao Zhang   }
156076ba34aSJunchao Zhang   PetscFunctionReturn(0);
157076ba34aSJunchao Zhang }
158076ba34aSJunchao Zhang 
159076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
160076ba34aSJunchao Zhang {
161076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
162076ba34aSJunchao Zhang 
163076ba34aSJunchao Zhang   PetscFunctionBegin;
1642328674fSJunchao Zhang   if (aijkok) {
165076ba34aSJunchao Zhang     aijkok->a_dual.clear_sync_state();
166076ba34aSJunchao Zhang     aijkok->a_dual.modify_host();
1672328674fSJunchao Zhang   }
168f0cf5187SStefano Zampini   PetscFunctionReturn(0);
169f0cf5187SStefano Zampini }
170f0cf5187SStefano Zampini 
171a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
172042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
173a587d139SMark {
174042217e8SBarry Smith   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
175042217e8SBarry Smith   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
176a587d139SMark 
177a587d139SMark   PetscFunctionBegin;
178076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
179152b3e56SJunchao Zhang   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
180a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
181a587d139SMark   PetscFunctionReturn(0);
182a587d139SMark }
183a587d139SMark 
184a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
185042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
186a587d139SMark {
187042217e8SBarry Smith   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
188a587d139SMark 
189a587d139SMark   PetscFunctionBegin;
190a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
191a587d139SMark     *d_mat = aijkok->device_mat_d.data();
192a587d139SMark   } else {
193a587d139SMark     PetscErrorCode   ierr;
194a587d139SMark     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
195a587d139SMark     *d_mat  = NULL;
196a587d139SMark   }
197a587d139SMark   PetscFunctionReturn(0);
198a587d139SMark }
199076ba34aSJunchao Zhang 
200076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */
201076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
202152b3e56SJunchao Zhang {
203152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
204152b3e56SJunchao Zhang 
205152b3e56SJunchao Zhang   PetscFunctionBegin;
206076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207076ba34aSJunchao Zhang   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
208076ba34aSJunchao Zhang     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
209076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
210076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
211076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
21286a27549SJunchao Zhang     aijkok->transpose_updated = PETSC_TRUE;
213076ba34aSJunchao Zhang   }
214076ba34aSJunchao Zhang   *csrmatT = &aijkok->csrmatT;
215152b3e56SJunchao Zhang   PetscFunctionReturn(0);
216152b3e56SJunchao Zhang }
217152b3e56SJunchao Zhang 
218076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */
219076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
220152b3e56SJunchao Zhang {
221152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
222152b3e56SJunchao Zhang 
223152b3e56SJunchao Zhang   PetscFunctionBegin;
224076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
225076ba34aSJunchao Zhang   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
226076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
227076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
228076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
229076ba34aSJunchao Zhang    #if defined(PETSC_USE_COMPLEX)
230076ba34aSJunchao Zhang     const auto& a = aijkok->csrmatH.values;
231076ba34aSJunchao Zhang     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
232076ba34aSJunchao Zhang    #endif
23386a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_TRUE;
234076ba34aSJunchao Zhang   }
235076ba34aSJunchao Zhang   *csrmatH = &aijkok->csrmatH;
236152b3e56SJunchao Zhang   PetscFunctionReturn(0);
237152b3e56SJunchao Zhang }
238a587d139SMark 
2398c3ff71bSJunchao Zhang /* y = A x */
2408c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2418c3ff71bSJunchao Zhang {
2428c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2438c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
244152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
245152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2468c3ff71bSJunchao Zhang 
2478c3ff71bSJunchao Zhang   PetscFunctionBegin;
2488c3ff71bSJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
249152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
250152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
2518c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
252152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
253152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
254152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
255bb2d6e60SJunchao Zhang   ierr   = WaitForKokkos();CHKERRQ(ierr);
256076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
257152b3e56SJunchao Zhang   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2588c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2598c3ff71bSJunchao Zhang }
2608c3ff71bSJunchao Zhang 
2618c3ff71bSJunchao Zhang /* y = A^T x */
2628c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2638c3ff71bSJunchao Zhang {
2648c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2658c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
266152b3e56SJunchao Zhang   const char                       *mode;
267152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
268152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
269076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
2708c3ff71bSJunchao Zhang 
2718c3ff71bSJunchao Zhang   PetscFunctionBegin;
2728c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
273152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
274152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
275152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
276076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
277152b3e56SJunchao Zhang     mode = "N";
278152b3e56SJunchao Zhang   } else {
279076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
280076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
281152b3e56SJunchao Zhang     mode = "T";
282152b3e56SJunchao Zhang   }
283076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
284152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
285152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
286bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
287076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
2888c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2898c3ff71bSJunchao Zhang }
2908c3ff71bSJunchao Zhang 
2918c3ff71bSJunchao Zhang /* y = A^H x */
2928c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2938c3ff71bSJunchao Zhang {
2948c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2958c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
296152b3e56SJunchao Zhang   const char                       *mode;
297152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
298152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
299076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3008c3ff71bSJunchao Zhang 
3018c3ff71bSJunchao Zhang   PetscFunctionBegin;
3028c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
303152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
304152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
305152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
306076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
307152b3e56SJunchao Zhang     mode = "N";
308152b3e56SJunchao Zhang   } else {
309076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
310076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
311152b3e56SJunchao Zhang     mode = "C";
312152b3e56SJunchao Zhang   }
313076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
314152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
315152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
316bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
317076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
3188c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3198c3ff71bSJunchao Zhang }
3208c3ff71bSJunchao Zhang 
3218c3ff71bSJunchao Zhang /* z = A x + y */
3228c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
3238c3ff71bSJunchao Zhang {
3248c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3258c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
326152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
327152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
3288c3ff71bSJunchao Zhang 
3298c3ff71bSJunchao Zhang   PetscFunctionBegin;
3308c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
331152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
332152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
333152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3348c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
3358c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
336152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
337152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
338152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
339152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
340bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
341152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
3428c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3438c3ff71bSJunchao Zhang }
3448c3ff71bSJunchao Zhang 
3458c3ff71bSJunchao Zhang /* z = A^T x + y */
3468c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3478c3ff71bSJunchao Zhang {
3488c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3498c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
350152b3e56SJunchao Zhang   const char                       *mode;
351152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
352152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
353076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3548c3ff71bSJunchao Zhang 
3558c3ff71bSJunchao Zhang   PetscFunctionBegin;
3568c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
357152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
358152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
359152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3608c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
361152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
362076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
363152b3e56SJunchao Zhang     mode = "N";
364152b3e56SJunchao Zhang   } else {
365076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
366076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
367152b3e56SJunchao Zhang     mode = "T";
368152b3e56SJunchao Zhang   }
369076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
370152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
371152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
372152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
373bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
374076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
3758c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3768c3ff71bSJunchao Zhang }
3778c3ff71bSJunchao Zhang 
3788c3ff71bSJunchao Zhang /* z = A^H x + y */
3798c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3808c3ff71bSJunchao Zhang {
3818c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3828c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
383152b3e56SJunchao Zhang   const char                       *mode;
384152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
385152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
386076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3878c3ff71bSJunchao Zhang 
3888c3ff71bSJunchao Zhang   PetscFunctionBegin;
3898c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
390152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
391152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
392152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3938c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
394152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
395076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
396152b3e56SJunchao Zhang     mode = "N";
397152b3e56SJunchao Zhang   } else {
398076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
399076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
400152b3e56SJunchao Zhang     mode = "C";
401152b3e56SJunchao Zhang   }
402076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
403152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
404152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
405152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
406bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
407076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
408152b3e56SJunchao Zhang   PetscFunctionReturn(0);
409152b3e56SJunchao Zhang }
410152b3e56SJunchao Zhang 
411152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
412152b3e56SJunchao Zhang {
413152b3e56SJunchao Zhang   PetscErrorCode            ierr;
414152b3e56SJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
415152b3e56SJunchao Zhang 
416152b3e56SJunchao Zhang   PetscFunctionBegin;
417152b3e56SJunchao Zhang   switch (op) {
418152b3e56SJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
419152b3e56SJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
420152b3e56SJunchao Zhang       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
421152b3e56SJunchao Zhang       A->form_explicit_transpose = flg;
422152b3e56SJunchao Zhang       break;
423152b3e56SJunchao Zhang     default:
424152b3e56SJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
425152b3e56SJunchao Zhang       break;
426152b3e56SJunchao Zhang   }
4278c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4288c3ff71bSJunchao Zhang }
4298c3ff71bSJunchao Zhang 
430076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
4313d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
4328c3ff71bSJunchao Zhang {
4338c3ff71bSJunchao Zhang   PetscErrorCode   ierr;
434076ba34aSJunchao Zhang   Mat_SeqAIJ       *aseq;
4358c3ff71bSJunchao Zhang 
4368c3ff71bSJunchao Zhang   PetscFunctionBegin;
437a3f881fbSStefano Zampini   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
438076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
439076ba34aSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
4408c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
441076ba34aSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
442076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
443076ba34aSJunchao Zhang     if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
444076ba34aSJunchao Zhang     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
445076ba34aSJunchao Zhang     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
446076ba34aSJunchao Zhang     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
447076ba34aSJunchao Zhang     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
448076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ*>(A->data);
449076ba34aSJunchao Zhang     if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */
450076ba34aSJunchao Zhang       if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
451076ba34aSJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
4528c3ff71bSJunchao Zhang     }
453076ba34aSJunchao Zhang   }
4548c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4558c3ff71bSJunchao Zhang }
4568c3ff71bSJunchao Zhang 
457076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
458076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
459076ba34aSJunchao Zhang  */
460076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
4618c3ff71bSJunchao Zhang {
4628c3ff71bSJunchao Zhang   PetscErrorCode        ierr;
463076ba34aSJunchao Zhang   Mat_SeqAIJ            *bseq;
464076ba34aSJunchao Zhang   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
465076ba34aSJunchao Zhang   Mat                   mat;
4668c3ff71bSJunchao Zhang 
4678c3ff71bSJunchao Zhang   PetscFunctionBegin;
468076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
469076ba34aSJunchao Zhang   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
470076ba34aSJunchao Zhang   mat  = *B;
471076ba34aSJunchao Zhang   if (A->assembled) {
472076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
473076ba34aSJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
474076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
475076ba34aSJunchao Zhang     /* Now copy values to B if needed */
476076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
477076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
478076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
479076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
480076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
481076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
482076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
483076ba34aSJunchao Zhang       }
484076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
485076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
486076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
487076ba34aSJunchao Zhang     }
488076ba34aSJunchao Zhang     mat->spptr = bkok;
489076ba34aSJunchao Zhang   }
490076ba34aSJunchao Zhang 
491076ba34aSJunchao Zhang   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
492076ba34aSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
493076ba34aSJunchao Zhang   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
494076ba34aSJunchao Zhang   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
4958c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4968c3ff71bSJunchao Zhang }
4978c3ff71bSJunchao Zhang 
4980ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
4990ecb592aSJunchao Zhang {
5000ecb592aSJunchao Zhang   PetscErrorCode    ierr;
5010ecb592aSJunchao Zhang   Mat               At;
5020ecb592aSJunchao Zhang   KokkosCsrMatrix   *internT,*csrmatT;
5030ecb592aSJunchao Zhang   Mat_SeqAIJKokkos  *atkok,*bkok;
5040ecb592aSJunchao Zhang 
5050ecb592aSJunchao Zhang   PetscFunctionBegin;
5060ecb592aSJunchao Zhang   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
5070ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
5080ecb592aSJunchao Zhang     CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */
5090ecb592aSJunchao Zhang     CHKERRCXX(atkok   = new Mat_SeqAIJKokkos(*csrmatT));
5100ecb592aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
5110ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
5120ecb592aSJunchao Zhang     else {ierr = MatHeaderMerge(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
5130ecb592aSJunchao Zhang   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
5140ecb592aSJunchao Zhang     if ((*B)->assembled) {
5150ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
5160ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
5170ecb592aSJunchao Zhang       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
5180ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
5190ecb592aSJunchao Zhang       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
5200ecb592aSJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
5210ecb592aSJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
5220ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
5230ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
5240ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
5250ecb592aSJunchao Zhang   }
5260ecb592aSJunchao Zhang   PetscFunctionReturn(0);
5270ecb592aSJunchao Zhang }
5280ecb592aSJunchao Zhang 
5298c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
5308c3ff71bSJunchao Zhang {
5318c3ff71bSJunchao Zhang   PetscErrorCode             ierr;
53286a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
5338c3ff71bSJunchao Zhang 
5348c3ff71bSJunchao Zhang   PetscFunctionBegin;
53586a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
53686a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
5378f7e8f9dSMark Adams     if (aijkok) {
5388f7e8f9dSMark Adams       if (aijkok->device_mat_d.data()) {
539a587d139SMark         delete aijkok->colmap_d;
540a587d139SMark         delete aijkok->i_uncompressed_d;
541a587d139SMark       }
5428f7e8f9dSMark Adams       if (aijkok->diag_d) delete aijkok->diag_d;
5438f7e8f9dSMark Adams     }
5448c3ff71bSJunchao Zhang     delete aijkok;
54586a27549SJunchao Zhang   } else {
54686a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
54786a27549SJunchao Zhang   }
548152b3e56SJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
549152b3e56SJunchao Zhang   A->spptr = NULL;
5508c3ff71bSJunchao Zhang   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
5518c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5528c3ff71bSJunchao Zhang }
5538c3ff71bSJunchao Zhang 
55486a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
55586a27549SJunchao Zhang {
55686a27549SJunchao Zhang   PetscErrorCode ierr;
55786a27549SJunchao Zhang 
55886a27549SJunchao Zhang   PetscFunctionBegin;
55986a27549SJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
56086a27549SJunchao Zhang   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
56186a27549SJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
56286a27549SJunchao Zhang   PetscFunctionReturn(0);
56386a27549SJunchao Zhang }
56486a27549SJunchao Zhang 
565076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
566076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
567a3f881fbSStefano Zampini {
568076ba34aSJunchao Zhang   PetscErrorCode               ierr;
569076ba34aSJunchao Zhang   Mat_SeqAIJ                   *a,*b;
570076ba34aSJunchao Zhang   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
571076ba34aSJunchao Zhang   MatScalarKokkosView          aa,ba,ca;
572076ba34aSJunchao Zhang   MatRowMapKokkosView          ai,bi,ci;
573076ba34aSJunchao Zhang   MatColIdxKokkosView          aj,bj,cj;
574076ba34aSJunchao Zhang   PetscInt                     m,n,nnz,aN;
575a3f881fbSStefano Zampini 
576a3f881fbSStefano Zampini   PetscFunctionBegin;
577076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
578076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
579076ba34aSJunchao Zhang   PetscValidPointer(C,4);
580076ba34aSJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
581076ba34aSJunchao Zhang   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
582*c0aa6a63SJacob Faibussowitsch   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
583076ba34aSJunchao Zhang   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
584076ba34aSJunchao Zhang 
585076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
586076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
587076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ*>(A->data);
588076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ*>(B->data);
589076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
590076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
591076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
592076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
593076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
594076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
595076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
596076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
597076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
598076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
599076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
600076ba34aSJunchao Zhang     aj = akok->j_dual.view_device();
601076ba34aSJunchao Zhang     bj = bkok->j_dual.view_device();
602076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
603076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
604076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
605076ba34aSJunchao Zhang     ca = ca_dual.view_device();
606076ba34aSJunchao Zhang     ci = ci_dual.view_device();
607076ba34aSJunchao Zhang     cj = cj_dual.view_device();
608076ba34aSJunchao Zhang 
609076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
610076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
611076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
612076ba34aSJunchao Zhang       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
613076ba34aSJunchao Zhang 
614076ba34aSJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
615076ba34aSJunchao Zhang         ci(i) = coffset;
616076ba34aSJunchao Zhang         if (i == m-1) ci(m) = ai(m) + bi(m);
617076ba34aSJunchao Zhang       });
618076ba34aSJunchao Zhang 
619076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
620076ba34aSJunchao Zhang         if (k < alen) {
621076ba34aSJunchao Zhang           ca(coffset+k) = aa(ai(i)+k);
622076ba34aSJunchao Zhang           cj(coffset+k) = aj(ai(i)+k);
623076ba34aSJunchao Zhang         } else {
624076ba34aSJunchao Zhang           ca(coffset+k) = ba(bi(i)+k-alen);
625076ba34aSJunchao Zhang           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
626076ba34aSJunchao Zhang         }
627076ba34aSJunchao Zhang       });
628076ba34aSJunchao Zhang     });
629076ba34aSJunchao Zhang     ca_dual.modify_device();
630076ba34aSJunchao Zhang     ci_dual.modify_device();
631076ba34aSJunchao Zhang     cj_dual.modify_device();
632076ba34aSJunchao Zhang     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
633076ba34aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
634076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
635076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
636076ba34aSJunchao Zhang     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
637076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
638076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
639076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
640076ba34aSJunchao Zhang 
641076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
642076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
643076ba34aSJunchao Zhang       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
644076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
645076ba34aSJunchao Zhang         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
646076ba34aSJunchao Zhang         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
647076ba34aSJunchao Zhang       });
648076ba34aSJunchao Zhang     });
649076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
650076ba34aSJunchao Zhang   }
651076ba34aSJunchao Zhang   PetscFunctionReturn(0);
652076ba34aSJunchao Zhang }
653076ba34aSJunchao Zhang 
654076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
655076ba34aSJunchao Zhang {
656076ba34aSJunchao Zhang   PetscFunctionBegin;
657076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
658a3f881fbSStefano Zampini   PetscFunctionReturn(0);
659a3f881fbSStefano Zampini }
660a3f881fbSStefano Zampini 
661a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
662a3f881fbSStefano Zampini {
663076ba34aSJunchao Zhang   PetscErrorCode                 ierr;
664a3f881fbSStefano Zampini   Mat_Product                    *product = C->product;
665a3f881fbSStefano Zampini   Mat                            A,B;
666076ba34aSJunchao Zhang   bool                           transA,transB; /* use bool, since KK needs this type */
667a3f881fbSStefano Zampini   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
668a3f881fbSStefano Zampini   Mat_SeqAIJ                     *c;
669076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
670076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB;
671a3f881fbSStefano Zampini 
672a3f881fbSStefano Zampini   PetscFunctionBegin;
673a3f881fbSStefano Zampini   MatCheckProduct(C,1);
674a3f881fbSStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
675076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
676076ba34aSJunchao Zhang 
677076ba34aSJunchao Zhang   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
678076ba34aSJunchao Zhang     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
679076ba34aSJunchao Zhang     PetscFunctionReturn(0);
680076ba34aSJunchao Zhang   }
681076ba34aSJunchao Zhang 
682076ba34aSJunchao Zhang   switch (product->type) {
683076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
684076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
685076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
686076ba34aSJunchao Zhang     default:
687076ba34aSJunchao Zhang       SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
688076ba34aSJunchao Zhang   }
689076ba34aSJunchao Zhang 
690a3f881fbSStefano Zampini   A     = product->A;
691a3f881fbSStefano Zampini   B     = product->B;
692a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
693a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
694a3f881fbSStefano Zampini   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
695a3f881fbSStefano Zampini   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
696a3f881fbSStefano Zampini   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
697076ba34aSJunchao Zhang 
698076ba34aSJunchao Zhang   if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
699076ba34aSJunchao Zhang 
700076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
701076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
702076ba34aSJunchao Zhang 
703076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
704076ba34aSJunchao Zhang   if (transA) {
705076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
706076ba34aSJunchao Zhang     transA = false;
707a3f881fbSStefano Zampini   }
708a3f881fbSStefano Zampini 
709076ba34aSJunchao Zhang   if (transB) {
710076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
711076ba34aSJunchao Zhang     transB = false;
712076ba34aSJunchao Zhang   }
713076ba34aSJunchao Zhang 
714076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
715076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
716076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
717a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
718a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
719*c0aa6a63SJacob Faibussowitsch   ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
720a3f881fbSStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
721*c0aa6a63SJacob Faibussowitsch   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
722a3f881fbSStefano Zampini   c->reallocs         = 0;
723076ba34aSJunchao Zhang   C->info.mallocs     = 0;
724a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
725a3f881fbSStefano Zampini   C->assembled        = C->was_assembled = PETSC_TRUE;
726a3f881fbSStefano Zampini   C->num_ass++;
727a3f881fbSStefano Zampini   PetscFunctionReturn(0);
728a3f881fbSStefano Zampini }
729a3f881fbSStefano Zampini 
730a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
731a3f881fbSStefano Zampini {
732a3f881fbSStefano Zampini   PetscErrorCode                 ierr;
733076ba34aSJunchao Zhang   Mat_Product                    *product = C->product;
734076ba34aSJunchao Zhang   MatProductType                 ptype;
735076ba34aSJunchao Zhang   Mat                            A,B;
736076ba34aSJunchao Zhang   bool                           transA,transB;
737076ba34aSJunchao Zhang   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
738076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
739076ba34aSJunchao Zhang   MPI_Comm                       comm;
740076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
741a3f881fbSStefano Zampini 
742a3f881fbSStefano Zampini   PetscFunctionBegin;
743a3f881fbSStefano Zampini   MatCheckProduct(C,1);
744076ba34aSJunchao Zhang   ierr = PetscObjectGetComm((PetscObject)C,&comm);
745076ba34aSJunchao Zhang   if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty");
746a3f881fbSStefano Zampini   A       = product->A;
747a3f881fbSStefano Zampini   B       = product->B;
748a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
749a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
750a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
751a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
752076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
753076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
754076ba34aSJunchao Zhang 
755a3f881fbSStefano Zampini   ptype   = product->type;
756a3f881fbSStefano Zampini   switch (ptype) {
757076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
758076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
759076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
760a3f881fbSStefano Zampini     default:
761076ba34aSJunchao Zhang       SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
762a3f881fbSStefano Zampini   }
763a3f881fbSStefano Zampini 
764076ba34aSJunchao Zhang   product->data = pdata = new MatProductData_SeqAIJKokkos();
765076ba34aSJunchao Zhang   pdata->kh.set_team_work_size(16);
766076ba34aSJunchao Zhang   pdata->kh.set_dynamic_scheduling(true);
767076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
768a3f881fbSStefano Zampini 
769076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
770076ba34aSJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
771076ba34aSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
772076ba34aSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
773076ba34aSJunchao Zhang     /* This algorithm + cuda-10.2 sometimes gave wrong results (invalid device pointers in csrmatC) and failed snes/tutorials/ex56.c */
774076ba34aSJunchao Zhang     spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_CUSPARSE;
775076ba34aSJunchao Zhang   #endif
776076ba34aSJunchao Zhang #endif
777076ba34aSJunchao Zhang   pdata->kh.create_spgemm_handle(spgemm_alg);
778076ba34aSJunchao Zhang 
779076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
780076ba34aSJunchao Zhang   if (transA) {
781076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
782076ba34aSJunchao Zhang     transA = false;
783076ba34aSJunchao Zhang   }
784076ba34aSJunchao Zhang 
785076ba34aSJunchao Zhang   if (transB) {
786076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
787076ba34aSJunchao Zhang     transB = false;
788076ba34aSJunchao Zhang   }
789076ba34aSJunchao Zhang 
790076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
791076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
792076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
793076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
794076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
795076ba34aSJunchao Zhang   */
796076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
797076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
798076ba34aSJunchao Zhang 
799076ba34aSJunchao Zhang   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
800076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
801076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
802a3f881fbSStefano Zampini   PetscFunctionReturn(0);
803a3f881fbSStefano Zampini }
804a3f881fbSStefano Zampini 
805a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
806076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
807a3f881fbSStefano Zampini {
808a3f881fbSStefano Zampini   PetscErrorCode ierr;
809076ba34aSJunchao Zhang   Mat_Product    *product = mat->product;
810a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
811a3f881fbSStefano Zampini 
812a3f881fbSStefano Zampini   PetscFunctionBegin;
813a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
814a3f881fbSStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
815a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
816a3f881fbSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
817a3f881fbSStefano Zampini   }
818a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
819a3f881fbSStefano Zampini     switch (product->type) {
820a3f881fbSStefano Zampini     case MATPRODUCT_AB:
821a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
822a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
823a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
824a3f881fbSStefano Zampini       break;
825a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
826a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
827a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
828a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
829a3f881fbSStefano Zampini       break;
830a3f881fbSStefano Zampini     default:
831a3f881fbSStefano Zampini       break;
832a3f881fbSStefano Zampini     }
833a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
834a3f881fbSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
835a3f881fbSStefano Zampini   }
836a3f881fbSStefano Zampini   PetscFunctionReturn(0);
837a3f881fbSStefano Zampini }
838a587d139SMark 
839f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
840f0cf5187SStefano Zampini {
841f0cf5187SStefano Zampini   PetscErrorCode   ierr;
842f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
843f0cf5187SStefano Zampini 
844f0cf5187SStefano Zampini   PetscFunctionBegin;
845f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
846f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
847076ba34aSJunchao Zhang   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
848076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
849f0cf5187SStefano Zampini   ierr = WaitForKokkos();CHKERRQ(ierr);
850076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
851f0cf5187SStefano Zampini   PetscFunctionReturn(0);
852f0cf5187SStefano Zampini }
853f0cf5187SStefano Zampini 
854a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
855a587d139SMark {
856a587d139SMark   PetscErrorCode   ierr;
857076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
858a587d139SMark 
859a587d139SMark   PetscFunctionBegin;
860076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
8612328674fSJunchao Zhang   if (aijkok) { /* Only zero the device if data is already there */
862076ba34aSJunchao Zhang     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
863076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
8642328674fSJunchao Zhang   } else { /* Might be preallocated but not assembled */
8652328674fSJunchao Zhang     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
8662328674fSJunchao Zhang   }
867a587d139SMark   PetscFunctionReturn(0);
868a587d139SMark }
869a587d139SMark 
870db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
871db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
872db78de30SJunchao Zhang {
873db78de30SJunchao Zhang   PetscErrorCode     ierr;
874db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
875db78de30SJunchao Zhang 
876db78de30SJunchao Zhang   PetscFunctionBegin;
877db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
878db78de30SJunchao Zhang   PetscValidPointer(kv,2);
879db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
880db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
881db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
882076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
883db78de30SJunchao Zhang   PetscFunctionReturn(0);
884db78de30SJunchao Zhang }
885db78de30SJunchao Zhang 
886db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
887db78de30SJunchao Zhang {
888db78de30SJunchao Zhang   PetscFunctionBegin;
889db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
890db78de30SJunchao Zhang   PetscValidPointer(kv,2);
891db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
892db78de30SJunchao Zhang   PetscFunctionReturn(0);
893db78de30SJunchao Zhang }
894db78de30SJunchao Zhang 
895db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv)
896db78de30SJunchao Zhang {
897db78de30SJunchao Zhang   PetscErrorCode     ierr;
898db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
899db78de30SJunchao Zhang 
900db78de30SJunchao Zhang   PetscFunctionBegin;
901db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
902db78de30SJunchao Zhang   PetscValidPointer(kv,2);
903db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
904db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
905db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
906076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
907db78de30SJunchao Zhang   PetscFunctionReturn(0);
908db78de30SJunchao Zhang }
909db78de30SJunchao Zhang 
910db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv)
911db78de30SJunchao Zhang {
912db78de30SJunchao Zhang   PetscErrorCode     ierr;
913db78de30SJunchao Zhang 
914db78de30SJunchao Zhang   PetscFunctionBegin;
915db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
916db78de30SJunchao Zhang   PetscValidPointer(kv,2);
917db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
918076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
919db78de30SJunchao Zhang   PetscFunctionReturn(0);
920db78de30SJunchao Zhang }
921db78de30SJunchao Zhang 
922db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
923db78de30SJunchao Zhang {
924db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
925db78de30SJunchao Zhang 
926db78de30SJunchao Zhang   PetscFunctionBegin;
927db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
928db78de30SJunchao Zhang   PetscValidPointer(kv,2);
929db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
930db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
931076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
932db78de30SJunchao Zhang   PetscFunctionReturn(0);
933db78de30SJunchao Zhang }
934db78de30SJunchao Zhang 
935db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
936db78de30SJunchao Zhang {
937db78de30SJunchao Zhang   PetscErrorCode     ierr;
938db78de30SJunchao Zhang 
939db78de30SJunchao Zhang   PetscFunctionBegin;
940db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
941db78de30SJunchao Zhang   PetscValidPointer(kv,2);
942db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
943076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
944db78de30SJunchao Zhang   PetscFunctionReturn(0);
945db78de30SJunchao Zhang }
946db78de30SJunchao Zhang 
947c17cf699SJunchao Zhang /* Computes Y += alpha X */
948c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
949a587d139SMark {
950a587d139SMark   PetscErrorCode             ierr;
951a587d139SMark   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
952c17cf699SJunchao Zhang   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
953c17cf699SJunchao Zhang   ConstMatScalarKokkosView   Xa;
954c17cf699SJunchao Zhang   MatScalarKokkosView        Ya;
955a587d139SMark 
956a587d139SMark   PetscFunctionBegin;
957c17cf699SJunchao Zhang   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
958c17cf699SJunchao Zhang   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
959c17cf699SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
960c17cf699SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
961db78de30SJunchao Zhang 
962c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
963c17cf699SJunchao Zhang     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
964a587d139SMark     PetscBool e;
965a587d139SMark     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
966a587d139SMark     if (e) {
967a587d139SMark       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
968c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
969a587d139SMark     }
970a587d139SMark   }
971db78de30SJunchao Zhang 
972c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
973c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
974c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
975c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
976c17cf699SJunchao Zhang   */
977c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
978c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
979c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
980c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
981c17cf699SJunchao Zhang 
982c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
983c17cf699SJunchao Zhang     KokkosBlas::axpy(alpha,Xa,Ya);
984c17cf699SJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(Y);
985c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
986c17cf699SJunchao Zhang     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
987c17cf699SJunchao Zhang     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
988c17cf699SJunchao Zhang 
989c17cf699SJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
990c17cf699SJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
991c17cf699SJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
992c17cf699SJunchao Zhang         PetscInt p,q = Yi(i);
993c17cf699SJunchao Zhang         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
994c17cf699SJunchao Zhang           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
995c17cf699SJunchao Zhang           if (Xj(p) == Yj(q)) { /* Found it */
996c17cf699SJunchao Zhang             Ya(q) += alpha * Xa(p);
997c17cf699SJunchao Zhang             q++;
998a587d139SMark           } else {
999c17cf699SJunchao Zhang             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1000c17cf699SJunchao Zhang                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1001c17cf699SJunchao Zhang             */
1002c17cf699SJunchao Zhang             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1003a587d139SMark           }
1004c17cf699SJunchao Zhang         }
1005c17cf699SJunchao Zhang       });
1006c17cf699SJunchao Zhang     });
1007c17cf699SJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(Y);
1008c17cf699SJunchao Zhang   } else { /* different nonzero patterns */
1009c17cf699SJunchao Zhang     Mat             Z;
1010c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
1011c17cf699SJunchao Zhang     KernelHandle    kh;
1012c17cf699SJunchao Zhang     kh.create_spadd_handle(false);
1013c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1014c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1015c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
1016c17cf699SJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1017c17cf699SJunchao Zhang     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1018c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1019c17cf699SJunchao Zhang   }
1020c17cf699SJunchao Zhang 
1021a587d139SMark   PetscFunctionReturn(0);
1022a587d139SMark }
1023a587d139SMark 
102486a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
10258f7e8f9dSMark Adams {
10268f7e8f9dSMark Adams   PetscErrorCode ierr;
10278f7e8f9dSMark Adams 
10288f7e8f9dSMark Adams   PetscFunctionBegin;
10298f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
10308f7e8f9dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
10318f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
10328f7e8f9dSMark Adams   PetscFunctionReturn(0);
10338f7e8f9dSMark Adams }
10348f7e8f9dSMark Adams 
10358c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
10368c3ff71bSJunchao Zhang {
1037076ba34aSJunchao Zhang   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1038076ba34aSJunchao Zhang 
10398c3ff71bSJunchao Zhang   PetscFunctionBegin;
1040076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
10416f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
10426f3d89d0SStefano Zampini 
10438c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
10448c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
10458c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1046a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1047f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1048a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1049076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
10508c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
10518c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
10528c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
10538c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
10548c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
10558c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1056076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
10570ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1058152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1059076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1060076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1061076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1062076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1063076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1064076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1065076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1066076ba34aSJunchao Zhang }
1067076ba34aSJunchao Zhang 
1068076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1069076ba34aSJunchao Zhang {
1070076ba34aSJunchao Zhang   PetscErrorCode     ierr;
1071076ba34aSJunchao Zhang   Mat_SeqAIJ         *aseq;
1072076ba34aSJunchao Zhang   PetscInt           i,m,n;
1073076ba34aSJunchao Zhang 
1074076ba34aSJunchao Zhang   PetscFunctionBegin;
1075076ba34aSJunchao Zhang   if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1076076ba34aSJunchao Zhang 
1077076ba34aSJunchao Zhang   m    = akok->nrows();
1078076ba34aSJunchao Zhang   n    = akok->ncols();
1079076ba34aSJunchao Zhang   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1080076ba34aSJunchao Zhang   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1081076ba34aSJunchao Zhang 
1082076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
1083076ba34aSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1084076ba34aSJunchao Zhang   aseq = (Mat_SeqAIJ*)(A)->data;
1085076ba34aSJunchao Zhang 
1086076ba34aSJunchao Zhang   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1087076ba34aSJunchao Zhang   akok->j_dual.sync_host();
1088076ba34aSJunchao Zhang 
1089076ba34aSJunchao Zhang   aseq->i            = akok->i_host_data();
1090076ba34aSJunchao Zhang   aseq->j            = akok->j_host_data();
1091076ba34aSJunchao Zhang   aseq->a            = akok->a_host_data();
1092076ba34aSJunchao Zhang   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1093076ba34aSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE;
1094076ba34aSJunchao Zhang   aseq->free_a       = PETSC_FALSE;
1095076ba34aSJunchao Zhang   aseq->free_ij      = PETSC_FALSE;
1096076ba34aSJunchao Zhang   aseq->nz           = akok->nnz();
1097076ba34aSJunchao Zhang   aseq->maxnz        = aseq->nz;
1098076ba34aSJunchao Zhang 
1099076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1100076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1101076ba34aSJunchao Zhang   for (i=0; i<m; i++) {
1102076ba34aSJunchao Zhang     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1103076ba34aSJunchao Zhang   }
1104076ba34aSJunchao Zhang 
1105076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1106076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1107076ba34aSJunchao Zhang   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1108076ba34aSJunchao Zhang   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1109076ba34aSJunchao Zhang   A->spptr = akok;
1110076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1111076ba34aSJunchao Zhang }
1112076ba34aSJunchao Zhang 
1113076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1114076ba34aSJunchao Zhang 
1115076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1116076ba34aSJunchao Zhang  */
1117076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1118076ba34aSJunchao Zhang {
1119076ba34aSJunchao Zhang   PetscErrorCode     ierr;
1120076ba34aSJunchao Zhang 
1121076ba34aSJunchao Zhang   PetscFunctionBegin;
1122076ba34aSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1123076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
11248c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
11258c3ff71bSJunchao Zhang }
11268c3ff71bSJunchao Zhang 
11278c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
1128152b3e56SJunchao Zhang /*@C
11298c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
11308c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
11318c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
11328c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
11338c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
11348c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
11358c3ff71bSJunchao Zhang 
11368c3ff71bSJunchao Zhang    Collective
11378c3ff71bSJunchao Zhang 
11388c3ff71bSJunchao Zhang    Input Parameters:
11398c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
11408c3ff71bSJunchao Zhang .  m - number of rows
11418c3ff71bSJunchao Zhang .  n - number of columns
11428c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
11438c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
11448c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
11458c3ff71bSJunchao Zhang 
11468c3ff71bSJunchao Zhang    Output Parameter:
11478c3ff71bSJunchao Zhang .  A - the matrix
11488c3ff71bSJunchao Zhang 
11498c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
11508c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
11518c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
11528c3ff71bSJunchao Zhang 
11538c3ff71bSJunchao Zhang    Notes:
11548c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
11558c3ff71bSJunchao Zhang 
11568c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
11578c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
11588c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
11598c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
11608c3ff71bSJunchao Zhang 
11618c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
11628c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
11638c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
11648c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
11658c3ff71bSJunchao Zhang 
11668c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
11678c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
11688c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
11698c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
11708c3ff71bSJunchao Zhang 
11718c3ff71bSJunchao Zhang    Level: intermediate
11728c3ff71bSJunchao Zhang 
1173076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
11748c3ff71bSJunchao Zhang @*/
11758c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
11768c3ff71bSJunchao Zhang {
11778c3ff71bSJunchao Zhang   PetscErrorCode ierr;
11788c3ff71bSJunchao Zhang 
11798c3ff71bSJunchao Zhang   PetscFunctionBegin;
11808c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
11818c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
11828c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
11838c3ff71bSJunchao Zhang   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
11848c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
11858c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
11868c3ff71bSJunchao Zhang }
1187930e68a5SMark Adams 
11888f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
11898f7e8f9dSMark Adams //
11908f7e8f9dSMark Adams // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
11918f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
11928f7e8f9dSMark Adams //
11938f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1194930e68a5SMark Adams {
11958f7e8f9dSMark Adams   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
11968f7e8f9dSMark Adams   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
11978f7e8f9dSMark Adams   IS                 isrow = b->row,isicol = b->icol;
1198930e68a5SMark Adams   PetscErrorCode     ierr;
11998f7e8f9dSMark Adams   const PetscInt     *r_h,*ic_h;
1200076ba34aSJunchao Zhang   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data();
1201076ba34aSJunchao Zhang   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1202076ba34aSJunchao Zhang   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
12038f7e8f9dSMark Adams   PetscBool          row_identity,col_identity;
12048f7e8f9dSMark Adams   PetscInt           nc, Nf, nVec=32; // should be a parameter
12058f7e8f9dSMark Adams   PetscContainer     container;
1206930e68a5SMark Adams 
1207930e68a5SMark Adams   PetscFunctionBegin;
1208*c0aa6a63SJacob Faibussowitsch   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
12098f7e8f9dSMark Adams   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
12108f7e8f9dSMark Adams   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
12118f7e8f9dSMark Adams   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
12128f7e8f9dSMark Adams   if (container) {
12138f7e8f9dSMark Adams     PetscInt *pNf=NULL, nv;
12148f7e8f9dSMark Adams     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
12158f7e8f9dSMark Adams     Nf = (*pNf)%1000;
12168f7e8f9dSMark Adams     nv = (*pNf)/1000;
12178f7e8f9dSMark Adams     if (nv>0) nVec = nv;
12188f7e8f9dSMark Adams   } else Nf = 1;
1219*c0aa6a63SJacob Faibussowitsch   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %" PetscInt_FMT " %" PetscInt_FMT,n,Nf);
12208f7e8f9dSMark Adams   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
12218f7e8f9dSMark Adams   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
12228f7e8f9dSMark Adams   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
12238f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
12248f7e8f9dSMark Adams   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
12258f7e8f9dSMark Adams #endif
12268f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
12278f7e8f9dSMark Adams   {
12288f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
12298f7e8f9dSMark Adams     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
12308f7e8f9dSMark Adams     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
12318f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
12328f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
12338f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
12348f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
12358f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
12368f7e8f9dSMark Adams     size_t flops_h = 0.0;
12378f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
12388f7e8f9dSMark Adams     Kokkos::View<size_t> d_flops_k ("flops");
12398f7e8f9dSMark Adams     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
12408f7e8f9dSMark Adams     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
12418f7e8f9dSMark Adams     Kokkos::deep_copy (d_flops_k, h_flops_k);
12428f7e8f9dSMark Adams     Kokkos::deep_copy (d_r_k, h_r_k);
12438f7e8f9dSMark Adams     Kokkos::deep_copy (d_ic_k, h_ic_k);
12448f7e8f9dSMark Adams     // Fill A --> fact
12458f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1246042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
12478f7e8f9dSMark Adams         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
12488f7e8f9dSMark Adams         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
12498f7e8f9dSMark Adams         // zero rows of B
12508f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
12518f7e8f9dSMark Adams             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
12528f7e8f9dSMark Adams             PetscScalar *baL = ba_d + bi_d[rowb];
12538f7e8f9dSMark Adams             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
12548f7e8f9dSMark Adams             /* zero (unfactored row) */
12558f7e8f9dSMark Adams             for (int j=0;j<nzbL;j++) baL[j] = 0;
12568f7e8f9dSMark Adams             for (int j=0;j<nzbU;j++) baU[j] = 0;
12578f7e8f9dSMark Adams           });
12588f7e8f9dSMark Adams         // copy A into B
12598f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
12608f7e8f9dSMark Adams             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
12618f7e8f9dSMark Adams             const PetscScalar *av    = aa_d + ai_d[rowa];
12628f7e8f9dSMark Adams             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
12638f7e8f9dSMark Adams             /* load in initial (unfactored row) */
12648f7e8f9dSMark Adams             for (int j=0;j<nza;j++) {
12658f7e8f9dSMark Adams               PetscInt    colb = ic[ajtmp[j]];
12668f7e8f9dSMark Adams               PetscScalar vala = av[j];
12678f7e8f9dSMark Adams               if (colb == rowb) {
12688f7e8f9dSMark Adams                 *(ba_d + bdiag_d[rowb]) = vala;
12698f7e8f9dSMark Adams               } else {
12708f7e8f9dSMark Adams                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
12718f7e8f9dSMark Adams                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
12728f7e8f9dSMark Adams                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
12738f7e8f9dSMark Adams                 for (int j=0; j<nz ; j++) {
12748f7e8f9dSMark Adams                   if (pbj[j] == colb) {
12758f7e8f9dSMark Adams                     pba[j] = vala;
12768f7e8f9dSMark Adams                     set++;
12778f7e8f9dSMark Adams                     break;
12788f7e8f9dSMark Adams                   }
12798f7e8f9dSMark Adams                 }
12808f7e8f9dSMark Adams                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
12818f7e8f9dSMark Adams               }
12828f7e8f9dSMark Adams             }
12838f7e8f9dSMark Adams           });
12848f7e8f9dSMark Adams       });
12858f7e8f9dSMark Adams     Kokkos::fence();
1286930e68a5SMark Adams 
12878f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
12888f7e8f9dSMark Adams         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
12898f7e8f9dSMark Adams         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
12908f7e8f9dSMark Adams         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1291042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
12928f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc;
12938f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
12948f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
12958f7e8f9dSMark Adams         for (int ii=start; ii<end-1; ii++) {
12968f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
12978f7e8f9dSMark Adams           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
12988f7e8f9dSMark Adams           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
12998f7e8f9dSMark Adams           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
13008f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
13018f7e8f9dSMark Adams               PetscInt kIdx = j*Ni + field_block_idx;
13028f7e8f9dSMark Adams               if (kIdx >= nzUi) /* void */ ;
13038f7e8f9dSMark Adams               else {
13048f7e8f9dSMark Adams                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
13058f7e8f9dSMark Adams                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
13068f7e8f9dSMark Adams                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
13078f7e8f9dSMark Adams                 size_t         st_idx;
13088f7e8f9dSMark Adams                 // find and do L(k,i) = A(:k,i) / A(i,i)
13098f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
13108f7e8f9dSMark Adams                 // get column, there has got to be a better way
13118f7e8f9dSMark Adams                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
13128f7e8f9dSMark Adams                     if (pjL[j] == ii) {
13138f7e8f9dSMark Adams                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
13148f7e8f9dSMark Adams                       idx = j; // output
13158f7e8f9dSMark Adams                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
13168f7e8f9dSMark Adams                     }
13178f7e8f9dSMark Adams                 }, st_idx);
13188f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
131999551766SMark Adams #if defined(PETSC_USE_DEBUG)
132099551766SMark Adams                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
132199551766SMark Adams #endif
132299551766SMark Adams                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
13238f7e8f9dSMark Adams                 // U(i+1,:end)
13248f7e8f9dSMark Adams                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
13258f7e8f9dSMark Adams                       PetscScalar Uij = baUi[uiIdx];
13268f7e8f9dSMark Adams                       PetscInt    col = bjUi[uiIdx];
13278f7e8f9dSMark Adams                       if (col==myk) {
13288f7e8f9dSMark Adams                         // A_kk = A_kk - L_ki * U_ij(k)
13298f7e8f9dSMark Adams                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
13308f7e8f9dSMark Adams                         *Akkv = *Akkv - L_ki() * Uij; // UiK
13318f7e8f9dSMark Adams                       } else {
13328f7e8f9dSMark Adams                         PetscScalar    *start, *end, *pAkjv=NULL;
13338f7e8f9dSMark Adams                         PetscInt       high, low;
13348f7e8f9dSMark Adams                         const PetscInt *startj;
13358f7e8f9dSMark Adams                         if (col<myk) { // L
13368f7e8f9dSMark Adams                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
13378f7e8f9dSMark Adams                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
13388f7e8f9dSMark Adams                           start = pLki+1; // start at pLki+1, A22(myk,1)
13398f7e8f9dSMark Adams                           startj= bj_d + bi_d[myk] + idx;
13408f7e8f9dSMark Adams                           end   = ba_d + bi_d[myk+1];
13418f7e8f9dSMark Adams                         } else {
13428f7e8f9dSMark Adams                           PetscInt idx = bdiag_d[myk+1]+1;
13438f7e8f9dSMark Adams                           start = ba_d + idx;
13448f7e8f9dSMark Adams                           startj= bj_d + idx;
13458f7e8f9dSMark Adams                           end   = ba_d + bdiag_d[myk];
13468f7e8f9dSMark Adams                         }
13478f7e8f9dSMark Adams                         // search for 'col', use bisection search - TODO
13488f7e8f9dSMark Adams                         low  = 0;
13498f7e8f9dSMark Adams                         high = (PetscInt)(end-start);
13508f7e8f9dSMark Adams                         while (high-low > 5) {
13518f7e8f9dSMark Adams                           int t = (low+high)/2;
13528f7e8f9dSMark Adams                           if (startj[t] > col) high = t;
13538f7e8f9dSMark Adams                           else                 low  = t;
13548f7e8f9dSMark Adams                         }
13558f7e8f9dSMark Adams                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
13568f7e8f9dSMark Adams                           if (startj[pAkjv-start] == col) break;
13578f7e8f9dSMark Adams                         }
135899551766SMark Adams #if defined(PETSC_USE_DEBUG)
135999551766SMark Adams                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
136099551766SMark Adams #endif
13618f7e8f9dSMark Adams                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
13628f7e8f9dSMark Adams                       }
13638f7e8f9dSMark Adams                     });
13648f7e8f9dSMark Adams               }
13658f7e8f9dSMark Adams             });
13668f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
13678f7e8f9dSMark Adams           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
13688f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
13698f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
13708f7e8f9dSMark Adams       });
13718f7e8f9dSMark Adams     Kokkos::fence();
13728f7e8f9dSMark Adams     Kokkos::deep_copy (h_flops_k, d_flops_k);
13738f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
13748f7e8f9dSMark Adams     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
13758f7e8f9dSMark Adams #elif  defined(PETSC_USE_LOG)
13768f7e8f9dSMark Adams     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
13778f7e8f9dSMark Adams #endif
13788f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
13798f7e8f9dSMark Adams         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
13808f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
13818f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
13828f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
13838f7e8f9dSMark Adams             int i = start + outer_index*Ni + lg_rank%Ni;
13848f7e8f9dSMark Adams             if (i < end) {
13858f7e8f9dSMark Adams               PetscScalar *pv = ba_d + bdiag_d[i];
13868f7e8f9dSMark Adams               *pv = 1.0/(*pv);
13878f7e8f9dSMark Adams             }
13888f7e8f9dSMark Adams           });
13898f7e8f9dSMark Adams       });
13908f7e8f9dSMark Adams   }
13918f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
13928f7e8f9dSMark Adams   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
13938f7e8f9dSMark Adams #endif
13948f7e8f9dSMark Adams   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
13958f7e8f9dSMark Adams   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
13968f7e8f9dSMark Adams 
13978f7e8f9dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
13988f7e8f9dSMark Adams   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
13998f7e8f9dSMark Adams   if (b->inode.size) {
14008f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
14018f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
14028f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
14038f7e8f9dSMark Adams   } else {
14048f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
14058f7e8f9dSMark Adams   }
14068f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
14078f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
14088f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
14098f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
14108f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
14118f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
14128f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
14138f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
14148f7e8f9dSMark Adams 
1415930e68a5SMark Adams   PetscFunctionReturn(0);
1416930e68a5SMark Adams }
1417930e68a5SMark Adams 
141886a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1419930e68a5SMark Adams {
1420930e68a5SMark Adams   PetscErrorCode   ierr;
1421930e68a5SMark Adams 
1422930e68a5SMark Adams   PetscFunctionBegin;
1423930e68a5SMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
142486a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
142586a27549SJunchao Zhang   PetscFunctionReturn(0);
142686a27549SJunchao Zhang }
142786a27549SJunchao Zhang 
142886a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
142986a27549SJunchao Zhang {
143086a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
143186a27549SJunchao Zhang 
143286a27549SJunchao Zhang   PetscFunctionBegin;
143386a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
143486a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
143586a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
143686a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
143786a27549SJunchao Zhang   }
143886a27549SJunchao Zhang   PetscFunctionReturn(0);
143986a27549SJunchao Zhang }
144086a27549SJunchao Zhang 
144186a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
144286a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
144386a27549SJunchao Zhang {
144486a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1445076ba34aSJunchao Zhang   MatColIdxType             n = A->rmap->n;
144686a27549SJunchao Zhang 
144786a27549SJunchao Zhang   PetscFunctionBegin;
144886a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
144986a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
1450076ba34aSJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
145186a27549SJunchao Zhang     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1452076ba34aSJunchao Zhang     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1453076ba34aSJunchao Zhang     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
145486a27549SJunchao Zhang 
145586a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1456076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1457076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1458076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
145986a27549SJunchao Zhang         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
146086a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
146186a27549SJunchao Zhang 
146286a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
146386a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
146486a27549SJunchao Zhang     */
1465076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1466076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
146786a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
146886a27549SJunchao Zhang 
146986a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
147086a27549SJunchao Zhang 
147186a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
1472076ba34aSJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
147386a27549SJunchao Zhang     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1474076ba34aSJunchao Zhang     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1475076ba34aSJunchao Zhang     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
147686a27549SJunchao Zhang 
147786a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1478076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1479076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1480076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
148186a27549SJunchao Zhang         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
148286a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
148386a27549SJunchao Zhang 
148486a27549SJunchao Zhang     /* Sort indices. See comments above */
1485076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1486076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
148786a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
148886a27549SJunchao Zhang 
148986a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
149086a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
149186a27549SJunchao Zhang   }
149286a27549SJunchao Zhang   PetscFunctionReturn(0);
149386a27549SJunchao Zhang }
149486a27549SJunchao Zhang 
149586a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
149686a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
149786a27549SJunchao Zhang {
149886a27549SJunchao Zhang   PetscErrorCode                 ierr;
149986a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
150086a27549SJunchao Zhang   PetscScalarKokkosView          xv;
150186a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
150286a27549SJunchao Zhang 
150386a27549SJunchao Zhang   PetscFunctionBegin;
150486a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
150586a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
150686a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
150786a27549SJunchao Zhang   /* Solve L tmpv = b */
15083f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
150986a27549SJunchao Zhang   /* Solve Ux = tmpv */
15103f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
151186a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
151286a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
151386a27549SJunchao Zhang   PetscFunctionReturn(0);
151486a27549SJunchao Zhang }
151586a27549SJunchao Zhang 
1516076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
151786a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
151886a27549SJunchao Zhang {
151986a27549SJunchao Zhang   PetscErrorCode                 ierr;
152086a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
152186a27549SJunchao Zhang   PetscScalarKokkosView          xv;
152286a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
152386a27549SJunchao Zhang 
152486a27549SJunchao Zhang   PetscFunctionBegin;
152586a27549SJunchao Zhang   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
152686a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
152786a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
152886a27549SJunchao Zhang   /* Solve U^T tmpv = b */
152986a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
153086a27549SJunchao Zhang 
153186a27549SJunchao Zhang   /* Solve L^T x = tmpv */
153286a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
153386a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
153486a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
153586a27549SJunchao Zhang   PetscFunctionReturn(0);
153686a27549SJunchao Zhang }
153786a27549SJunchao Zhang 
153886a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
153986a27549SJunchao Zhang {
154086a27549SJunchao Zhang   PetscErrorCode                 ierr;
154186a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
154286a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
154386a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
154486a27549SJunchao Zhang 
154586a27549SJunchao Zhang   PetscFunctionBegin;
154686a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1547076ba34aSJunchao Zhang 
1548076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1549076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1550076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1551076ba34aSJunchao Zhang 
1552076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
155386a27549SJunchao Zhang 
155486a27549SJunchao Zhang   B->assembled                       = PETSC_TRUE;
155586a27549SJunchao Zhang   B->preallocated                    = PETSC_TRUE;
155686a27549SJunchao Zhang   B->ops->solve                      = MatSolve_SeqAIJKokkos;
155786a27549SJunchao Zhang   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
155886a27549SJunchao Zhang   B->ops->matsolve                   = NULL;
155986a27549SJunchao Zhang   B->ops->matsolvetranspose          = NULL;
156086a27549SJunchao Zhang   B->offloadmask                     = PETSC_OFFLOAD_GPU;
156186a27549SJunchao Zhang 
156286a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
156386a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
156486a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
156586a27549SJunchao Zhang   PetscFunctionReturn(0);
156686a27549SJunchao Zhang }
156786a27549SJunchao Zhang 
156886a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
156986a27549SJunchao Zhang {
157086a27549SJunchao Zhang   PetscErrorCode                 ierr;
157186a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok;
157286a27549SJunchao Zhang   Mat_SeqAIJ                     *b;
157386a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
157486a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
157586a27549SJunchao Zhang   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
157686a27549SJunchao Zhang   PetscInt                       n = A->rmap->n;
157786a27549SJunchao Zhang 
157886a27549SJunchao Zhang   PetscFunctionBegin;
157986a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
158086a27549SJunchao Zhang   /* Rebuild factors */
158186a27549SJunchao Zhang   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
158286a27549SJunchao Zhang   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
158386a27549SJunchao Zhang 
158486a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
158586a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
158686a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
158786a27549SJunchao Zhang 
158886a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
158986a27549SJunchao Zhang 
159086a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
159186a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
159286a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d,n+1);
159386a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
159486a27549SJunchao Zhang 
159586a27549SJunchao Zhang   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1596076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1597076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1598076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
159986a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
160086a27549SJunchao Zhang 
160186a27549SJunchao Zhang   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
160286a27549SJunchao Zhang   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
160386a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
160486a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
160586a27549SJunchao Zhang 
160686a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
160786a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
160886a27549SJunchao Zhang  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
160986a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
161086a27549SJunchao Zhang  #else
161186a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
161286a27549SJunchao Zhang  #endif
161386a27549SJunchao Zhang 
161486a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
161586a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
161686a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
161786a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
161886a27549SJunchao Zhang 
161986a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
162086a27549SJunchao Zhang   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
162186a27549SJunchao Zhang   b     = (Mat_SeqAIJ*)B->data;
162286a27549SJunchao Zhang   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
162386a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
162486a27549SJunchao Zhang   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
162586a27549SJunchao Zhang 
162686a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
162786a27549SJunchao Zhang   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1628930e68a5SMark Adams   PetscFunctionReturn(0);
1629930e68a5SMark Adams }
1630930e68a5SMark Adams 
16318f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
16328f7e8f9dSMark Adams {
16338f7e8f9dSMark Adams   PetscErrorCode   ierr;
16348f7e8f9dSMark Adams   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
16358f7e8f9dSMark Adams   const PetscInt   nrows   = A->rmap->n;
1636930e68a5SMark Adams 
16378f7e8f9dSMark Adams   PetscFunctionBegin;
16388f7e8f9dSMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
16398f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
16408f7e8f9dSMark Adams   // move B data into Kokkos
16418f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
16428f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
16438f7e8f9dSMark Adams   {
16448f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
16458f7e8f9dSMark Adams     if (!baijkok->diag_d) {
16468f7e8f9dSMark Adams       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1647152b3e56SJunchao Zhang       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
16488f7e8f9dSMark Adams       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
16498f7e8f9dSMark Adams     }
16508f7e8f9dSMark Adams   }
16518f7e8f9dSMark Adams   PetscFunctionReturn(0);
16528f7e8f9dSMark Adams }
16538f7e8f9dSMark Adams 
165486a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1655930e68a5SMark Adams {
1656930e68a5SMark Adams   PetscFunctionBegin;
1657930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
1658930e68a5SMark Adams   PetscFunctionReturn(0);
1659930e68a5SMark Adams }
1660930e68a5SMark Adams 
16618f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
16628f7e8f9dSMark Adams {
16638f7e8f9dSMark Adams   PetscFunctionBegin;
16648f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
16658f7e8f9dSMark Adams   PetscFunctionReturn(0);
16668f7e8f9dSMark Adams }
16678f7e8f9dSMark Adams 
1668930e68a5SMark Adams /*MC
166986a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
167086a27549SJunchao Zhang   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1671930e68a5SMark Adams 
1672930e68a5SMark Adams   Level: beginner
1673930e68a5SMark Adams 
167486a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1675930e68a5SMark Adams M*/
167686a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1677930e68a5SMark Adams {
1678930e68a5SMark Adams   PetscErrorCode ierr;
1679930e68a5SMark Adams   PetscInt       n = A->rmap->n;
1680930e68a5SMark Adams 
1681930e68a5SMark Adams   PetscFunctionBegin;
1682930e68a5SMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1683930e68a5SMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1684930e68a5SMark Adams   (*B)->factortype = ftype;
16854ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1686930e68a5SMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1687930e68a5SMark Adams 
16888f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
1689930e68a5SMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
169086a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_TRUE;
169186a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
169286a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
169386a27549SJunchao Zhang     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
169486a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
169586a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
169686a27549SJunchao Zhang   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1697930e68a5SMark Adams 
1698930e68a5SMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
169986a27549SJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1700930e68a5SMark Adams   PetscFunctionReturn(0);
1701930e68a5SMark Adams }
17028f7e8f9dSMark Adams 
17038f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
17048f7e8f9dSMark Adams {
17058f7e8f9dSMark Adams   PetscErrorCode ierr;
17068f7e8f9dSMark Adams   PetscInt       n = A->rmap->n;
17078f7e8f9dSMark Adams 
17088f7e8f9dSMark Adams   PetscFunctionBegin;
17098f7e8f9dSMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
17108f7e8f9dSMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
17118f7e8f9dSMark Adams   (*B)->factortype = ftype;
1712f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
17134ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
17148f7e8f9dSMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
17158f7e8f9dSMark Adams 
17168f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
17178f7e8f9dSMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
17188f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
17198f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
17208f7e8f9dSMark Adams 
17218f7e8f9dSMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
17228f7e8f9dSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
17238f7e8f9dSMark Adams   PetscFunctionReturn(0);
17248f7e8f9dSMark Adams }
172586a27549SJunchao Zhang 
172686a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
172786a27549SJunchao Zhang {
172886a27549SJunchao Zhang   PetscErrorCode ierr;
172986a27549SJunchao Zhang 
173086a27549SJunchao Zhang   PetscFunctionBegin;
173186a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
173286a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
173386a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
173486a27549SJunchao Zhang   PetscFunctionReturn(0);
173586a27549SJunchao Zhang }
173686a27549SJunchao Zhang 
1737076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
1738076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1739076ba34aSJunchao Zhang {
1740076ba34aSJunchao Zhang   PetscErrorCode    ierr;
1741076ba34aSJunchao Zhang   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1742076ba34aSJunchao Zhang   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1743076ba34aSJunchao Zhang   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1744076ba34aSJunchao Zhang   const PetscInt    *i = iv.data();
1745076ba34aSJunchao Zhang   const PetscInt    *j = jv.data();
1746076ba34aSJunchao Zhang   const PetscScalar *a = av.data();
1747076ba34aSJunchao Zhang   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1748076ba34aSJunchao Zhang 
1749076ba34aSJunchao Zhang   PetscFunctionBegin;
1750*c0aa6a63SJacob Faibussowitsch   ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1751076ba34aSJunchao Zhang   for (PetscInt k=0; k<m; k++) {
1752*c0aa6a63SJacob Faibussowitsch     ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr);
1753076ba34aSJunchao Zhang     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1754*c0aa6a63SJacob Faibussowitsch       ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1755076ba34aSJunchao Zhang     }
1756076ba34aSJunchao Zhang     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1757076ba34aSJunchao Zhang   }
1758076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1759076ba34aSJunchao Zhang }
1760