xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 42550bec5353b99bbf937a3bdb81525a3b04a032)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2076ba34aSJunchao Zhang #include <petscpkg_version.h>
3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
4*42550becSJunchao Zhang #include <petsc/private/sfimpl.h>
58c3ff71bSJunchao Zhang #include <petscsystypes.h>
68c3ff71bSJunchao Zhang #include <petscerror.h>
78c3ff71bSJunchao Zhang 
88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
9f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
10076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp>
118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
128c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
15076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
16076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
1786a27549SJunchao Zhang 
188c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h>
19*42550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
208c3ff71bSJunchao Zhang 
218c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
228c3ff71bSJunchao Zhang 
23076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
24076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
25076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
26076ba34aSJunchao Zhang  */
278c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
288c3ff71bSJunchao Zhang {
298c3ff71bSJunchao Zhang   PetscErrorCode    ierr;
30076ba34aSJunchao Zhang   Mat_SeqAIJ        *aijseq;
31076ba34aSJunchao Zhang   Mat_SeqAIJKokkos  *aijkok;
328c3ff71bSJunchao Zhang 
338c3ff71bSJunchao Zhang   PetscFunctionBegin;
34076ba34aSJunchao Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
358c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
36076ba34aSJunchao Zhang 
37076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
38076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
39076ba34aSJunchao Zhang 
40076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
41076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
42076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
43076ba34aSJunchao Zhang   */
44076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
45076ba34aSJunchao Zhang     delete aijkok;
46076ba34aSJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
47076ba34aSJunchao Zhang     A->spptr = aijkok;
48076ba34aSJunchao Zhang   }
49076ba34aSJunchao Zhang 
50a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
51a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
52a587d139SMark   }
538c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
548c3ff71bSJunchao Zhang }
558c3ff71bSJunchao Zhang 
5686a27549SJunchao Zhang /* Sync CSR data to device if not yet */
57076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
588c3ff71bSJunchao Zhang {
598c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
608c3ff71bSJunchao Zhang 
618c3ff71bSJunchao Zhang   PetscFunctionBegin;
6286a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
63076ba34aSJunchao Zhang   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
64076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
65076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
66076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
67076ba34aSJunchao Zhang     aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */
6886a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
698c3ff71bSJunchao Zhang   }
708c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
718c3ff71bSJunchao Zhang }
728c3ff71bSJunchao Zhang 
73076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
74fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
7586a27549SJunchao Zhang {
7686a27549SJunchao Zhang   PetscErrorCode   ierr;
7786a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
7886a27549SJunchao Zhang 
7986a27549SJunchao Zhang   PetscFunctionBegin;
8086a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
8186a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
8286a27549SJunchao Zhang   aijkok->a_dual.modify_device();
8386a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
8486a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
852328674fSJunchao Zhang   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
8686a27549SJunchao Zhang   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
8786a27549SJunchao Zhang   PetscFunctionReturn(0);
8886a27549SJunchao Zhang }
8986a27549SJunchao Zhang 
90f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
91f0cf5187SStefano Zampini {
92f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
93f0cf5187SStefano Zampini 
94f0cf5187SStefano Zampini   PetscFunctionBegin;
95f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9686a27549SJunchao Zhang    /* We do not expect one needs factors on host  */
9786a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
98f0cf5187SStefano Zampini   if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
99076ba34aSJunchao Zhang   aijkok->a_dual.sync_host();
100f0cf5187SStefano Zampini   PetscFunctionReturn(0);
101f0cf5187SStefano Zampini }
102f0cf5187SStefano Zampini 
103f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
104f0cf5187SStefano Zampini {
105076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
106f0cf5187SStefano Zampini 
107f0cf5187SStefano Zampini   PetscFunctionBegin;
108076ba34aSJunchao Zhang   if (aijkok) {
109076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
110076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
111076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
112076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
113076ba34aSJunchao Zhang   }
114076ba34aSJunchao Zhang   PetscFunctionReturn(0);
115076ba34aSJunchao Zhang }
116076ba34aSJunchao Zhang 
117076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
118076ba34aSJunchao Zhang {
119076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
120076ba34aSJunchao Zhang 
121076ba34aSJunchao Zhang   PetscFunctionBegin;
122076ba34aSJunchao Zhang   if (aijkok) aijkok->a_dual.modify_host();
123076ba34aSJunchao Zhang   PetscFunctionReturn(0);
124076ba34aSJunchao Zhang }
125076ba34aSJunchao Zhang 
126076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
127076ba34aSJunchao Zhang {
128076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
129076ba34aSJunchao Zhang 
130076ba34aSJunchao Zhang   PetscFunctionBegin;
1312328674fSJunchao Zhang   if (aijkok) {
132076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
133076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1342328674fSJunchao Zhang   } else {
1352328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1362328674fSJunchao Zhang   }
137076ba34aSJunchao Zhang   PetscFunctionReturn(0);
138076ba34aSJunchao Zhang }
139076ba34aSJunchao Zhang 
140076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
141076ba34aSJunchao Zhang {
142076ba34aSJunchao Zhang   PetscFunctionBegin;
143076ba34aSJunchao Zhang   *array = NULL;
144076ba34aSJunchao Zhang   PetscFunctionReturn(0);
145076ba34aSJunchao Zhang }
146076ba34aSJunchao Zhang 
147076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
148076ba34aSJunchao Zhang {
149076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
150076ba34aSJunchao Zhang 
151076ba34aSJunchao Zhang   PetscFunctionBegin;
1522328674fSJunchao Zhang   if (aijkok) {
153076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1542328674fSJunchao Zhang   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
1552328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1562328674fSJunchao Zhang   }
157076ba34aSJunchao Zhang   PetscFunctionReturn(0);
158076ba34aSJunchao Zhang }
159076ba34aSJunchao Zhang 
160076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
161076ba34aSJunchao Zhang {
162076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
163076ba34aSJunchao Zhang 
164076ba34aSJunchao Zhang   PetscFunctionBegin;
1652328674fSJunchao Zhang   if (aijkok) {
166076ba34aSJunchao Zhang     aijkok->a_dual.clear_sync_state();
167076ba34aSJunchao Zhang     aijkok->a_dual.modify_host();
1682328674fSJunchao Zhang   }
169f0cf5187SStefano Zampini   PetscFunctionReturn(0);
170f0cf5187SStefano Zampini }
171f0cf5187SStefano Zampini 
172a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
173042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
174a587d139SMark {
175042217e8SBarry Smith   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
176042217e8SBarry Smith   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
177a587d139SMark 
178a587d139SMark   PetscFunctionBegin;
179076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
180152b3e56SJunchao Zhang   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
181a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
182a587d139SMark   PetscFunctionReturn(0);
183a587d139SMark }
184a587d139SMark 
185a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
186042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
187a587d139SMark {
188042217e8SBarry Smith   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
189a587d139SMark 
190a587d139SMark   PetscFunctionBegin;
191a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
192a587d139SMark     *d_mat = aijkok->device_mat_d.data();
193a587d139SMark   } else {
194a587d139SMark     PetscErrorCode   ierr;
195a587d139SMark     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
196a587d139SMark     *d_mat  = NULL;
197a587d139SMark   }
198a587d139SMark   PetscFunctionReturn(0);
199a587d139SMark }
200076ba34aSJunchao Zhang 
201076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */
202076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
203152b3e56SJunchao Zhang {
204152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
205152b3e56SJunchao Zhang 
206152b3e56SJunchao Zhang   PetscFunctionBegin;
207076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
208076ba34aSJunchao Zhang   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
209076ba34aSJunchao Zhang     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
210076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
211076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
212076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
21386a27549SJunchao Zhang     aijkok->transpose_updated = PETSC_TRUE;
214076ba34aSJunchao Zhang   }
215076ba34aSJunchao Zhang   *csrmatT = &aijkok->csrmatT;
216152b3e56SJunchao Zhang   PetscFunctionReturn(0);
217152b3e56SJunchao Zhang }
218152b3e56SJunchao Zhang 
219076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */
220076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
221152b3e56SJunchao Zhang {
222152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
223152b3e56SJunchao Zhang 
224152b3e56SJunchao Zhang   PetscFunctionBegin;
225076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
226076ba34aSJunchao Zhang   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
227076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
228076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
229076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
230076ba34aSJunchao Zhang    #if defined(PETSC_USE_COMPLEX)
231076ba34aSJunchao Zhang     const auto& a = aijkok->csrmatH.values;
232076ba34aSJunchao Zhang     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
233076ba34aSJunchao Zhang    #endif
23486a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_TRUE;
235076ba34aSJunchao Zhang   }
236076ba34aSJunchao Zhang   *csrmatH = &aijkok->csrmatH;
237152b3e56SJunchao Zhang   PetscFunctionReturn(0);
238152b3e56SJunchao Zhang }
239a587d139SMark 
2408c3ff71bSJunchao Zhang /* y = A x */
2418c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2428c3ff71bSJunchao Zhang {
2438c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2448c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
245152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
246152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2478c3ff71bSJunchao Zhang 
2488c3ff71bSJunchao Zhang   PetscFunctionBegin;
2498c3ff71bSJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
250152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
251152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
2528c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
253152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
254152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
255152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
256bb2d6e60SJunchao Zhang   ierr   = WaitForKokkos();CHKERRQ(ierr);
257076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
258152b3e56SJunchao Zhang   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2598c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2608c3ff71bSJunchao Zhang }
2618c3ff71bSJunchao Zhang 
2628c3ff71bSJunchao Zhang /* y = A^T x */
2638c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2648c3ff71bSJunchao Zhang {
2658c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2668c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
267152b3e56SJunchao Zhang   const char                       *mode;
268152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
269152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
270076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
2718c3ff71bSJunchao Zhang 
2728c3ff71bSJunchao Zhang   PetscFunctionBegin;
2738c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
274152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
275152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
276152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
277076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
278152b3e56SJunchao Zhang     mode = "N";
279152b3e56SJunchao Zhang   } else {
280076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
281076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
282152b3e56SJunchao Zhang     mode = "T";
283152b3e56SJunchao Zhang   }
284076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
285152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
286152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
287bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
288076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
2898c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2908c3ff71bSJunchao Zhang }
2918c3ff71bSJunchao Zhang 
2928c3ff71bSJunchao Zhang /* y = A^H x */
2938c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2948c3ff71bSJunchao Zhang {
2958c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2968c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
297152b3e56SJunchao Zhang   const char                       *mode;
298152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
299152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
300076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3018c3ff71bSJunchao Zhang 
3028c3ff71bSJunchao Zhang   PetscFunctionBegin;
3038c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
304152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
305152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
306152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
307076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
308152b3e56SJunchao Zhang     mode = "N";
309152b3e56SJunchao Zhang   } else {
310076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
311076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
312152b3e56SJunchao Zhang     mode = "C";
313152b3e56SJunchao Zhang   }
314076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
315152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
316152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
317bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
318076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
3198c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3208c3ff71bSJunchao Zhang }
3218c3ff71bSJunchao Zhang 
3228c3ff71bSJunchao Zhang /* z = A x + y */
3238c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
3248c3ff71bSJunchao Zhang {
3258c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3268c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
327152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
328152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
3298c3ff71bSJunchao Zhang 
3308c3ff71bSJunchao Zhang   PetscFunctionBegin;
3318c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
332152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
333152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
334152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3358c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
3368c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
337152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
338152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
339152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
340152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
341bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
342152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
3438c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3448c3ff71bSJunchao Zhang }
3458c3ff71bSJunchao Zhang 
3468c3ff71bSJunchao Zhang /* z = A^T x + y */
3478c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3488c3ff71bSJunchao Zhang {
3498c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3508c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
351152b3e56SJunchao Zhang   const char                       *mode;
352152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
353152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
354076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3558c3ff71bSJunchao Zhang 
3568c3ff71bSJunchao Zhang   PetscFunctionBegin;
3578c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
358152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
359152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
360152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3618c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
362152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
363076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
364152b3e56SJunchao Zhang     mode = "N";
365152b3e56SJunchao Zhang   } else {
366076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
367076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
368152b3e56SJunchao Zhang     mode = "T";
369152b3e56SJunchao Zhang   }
370076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
371152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
372152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
373152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
374bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
375076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
3768c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3778c3ff71bSJunchao Zhang }
3788c3ff71bSJunchao Zhang 
3798c3ff71bSJunchao Zhang /* z = A^H x + y */
3808c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3818c3ff71bSJunchao Zhang {
3828c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3838c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
384152b3e56SJunchao Zhang   const char                       *mode;
385152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
386152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
387076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3888c3ff71bSJunchao Zhang 
3898c3ff71bSJunchao Zhang   PetscFunctionBegin;
3908c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
391152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
392152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
393152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3948c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
395152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
396076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
397152b3e56SJunchao Zhang     mode = "N";
398152b3e56SJunchao Zhang   } else {
399076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
400076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
401152b3e56SJunchao Zhang     mode = "C";
402152b3e56SJunchao Zhang   }
403076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
404152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
405152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
406152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
407bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
408076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
409152b3e56SJunchao Zhang   PetscFunctionReturn(0);
410152b3e56SJunchao Zhang }
411152b3e56SJunchao Zhang 
412152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
413152b3e56SJunchao Zhang {
414152b3e56SJunchao Zhang   PetscErrorCode            ierr;
415152b3e56SJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
416152b3e56SJunchao Zhang 
417152b3e56SJunchao Zhang   PetscFunctionBegin;
418152b3e56SJunchao Zhang   switch (op) {
419152b3e56SJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
420152b3e56SJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
421152b3e56SJunchao Zhang       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
422152b3e56SJunchao Zhang       A->form_explicit_transpose = flg;
423152b3e56SJunchao Zhang       break;
424152b3e56SJunchao Zhang     default:
425152b3e56SJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
426152b3e56SJunchao Zhang       break;
427152b3e56SJunchao Zhang   }
4288c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4298c3ff71bSJunchao Zhang }
4308c3ff71bSJunchao Zhang 
431076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
4323d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
4338c3ff71bSJunchao Zhang {
4348c3ff71bSJunchao Zhang   PetscErrorCode   ierr;
435076ba34aSJunchao Zhang   Mat_SeqAIJ       *aseq;
4368c3ff71bSJunchao Zhang 
4378c3ff71bSJunchao Zhang   PetscFunctionBegin;
438a3f881fbSStefano Zampini   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
439076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
440076ba34aSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
4418c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
442076ba34aSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
443076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
444076ba34aSJunchao Zhang     if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
445076ba34aSJunchao Zhang     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
446076ba34aSJunchao Zhang     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
447076ba34aSJunchao Zhang     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
448076ba34aSJunchao Zhang     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
449076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ*>(A->data);
450076ba34aSJunchao Zhang     if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */
451076ba34aSJunchao Zhang       if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
452076ba34aSJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
4538c3ff71bSJunchao Zhang     }
454076ba34aSJunchao Zhang   }
4558c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4568c3ff71bSJunchao Zhang }
4578c3ff71bSJunchao Zhang 
458076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
459076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
460076ba34aSJunchao Zhang  */
461076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
4628c3ff71bSJunchao Zhang {
4638c3ff71bSJunchao Zhang   PetscErrorCode        ierr;
464076ba34aSJunchao Zhang   Mat_SeqAIJ            *bseq;
465076ba34aSJunchao Zhang   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
466076ba34aSJunchao Zhang   Mat                   mat;
4678c3ff71bSJunchao Zhang 
4688c3ff71bSJunchao Zhang   PetscFunctionBegin;
469076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
470076ba34aSJunchao Zhang   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
471076ba34aSJunchao Zhang   mat  = *B;
472076ba34aSJunchao Zhang   if (A->assembled) {
473076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
474076ba34aSJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
475076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
476076ba34aSJunchao Zhang     /* Now copy values to B if needed */
477076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
478076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
479076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
480076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
481076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
482076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
483076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
484076ba34aSJunchao Zhang       }
485076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
486076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
487076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
488076ba34aSJunchao Zhang     }
489076ba34aSJunchao Zhang     mat->spptr = bkok;
490076ba34aSJunchao Zhang   }
491076ba34aSJunchao Zhang 
492076ba34aSJunchao Zhang   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
493076ba34aSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
494076ba34aSJunchao Zhang   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
495076ba34aSJunchao Zhang   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
4968c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4978c3ff71bSJunchao Zhang }
4988c3ff71bSJunchao Zhang 
4990ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
5000ecb592aSJunchao Zhang {
5010ecb592aSJunchao Zhang   PetscErrorCode    ierr;
5020ecb592aSJunchao Zhang   Mat               At;
5030ecb592aSJunchao Zhang   KokkosCsrMatrix   *internT,*csrmatT;
5040ecb592aSJunchao Zhang   Mat_SeqAIJKokkos  *atkok,*bkok;
5050ecb592aSJunchao Zhang 
5060ecb592aSJunchao Zhang   PetscFunctionBegin;
5070ecb592aSJunchao Zhang   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
5080ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
5090ecb592aSJunchao Zhang     CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */
5100ecb592aSJunchao Zhang     CHKERRCXX(atkok   = new Mat_SeqAIJKokkos(*csrmatT));
5110ecb592aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
5120ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
5137a3b1c56SStefano Zampini     else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
5140ecb592aSJunchao Zhang   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
5150ecb592aSJunchao Zhang     if ((*B)->assembled) {
5160ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
5170ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
5180ecb592aSJunchao Zhang       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
5190ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
5200ecb592aSJunchao Zhang       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
5210ecb592aSJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
5220ecb592aSJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
5230ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
5240ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
5250ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
5260ecb592aSJunchao Zhang   }
5270ecb592aSJunchao Zhang   PetscFunctionReturn(0);
5280ecb592aSJunchao Zhang }
5290ecb592aSJunchao Zhang 
5308c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
5318c3ff71bSJunchao Zhang {
5328c3ff71bSJunchao Zhang   PetscErrorCode             ierr;
53386a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
5348c3ff71bSJunchao Zhang 
5358c3ff71bSJunchao Zhang   PetscFunctionBegin;
53686a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
53786a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
5388f7e8f9dSMark Adams     if (aijkok) {
5398f7e8f9dSMark Adams       if (aijkok->device_mat_d.data()) {
540a587d139SMark         delete aijkok->colmap_d;
541a587d139SMark         delete aijkok->i_uncompressed_d;
542a587d139SMark       }
5438f7e8f9dSMark Adams       if (aijkok->diag_d) delete aijkok->diag_d;
5448f7e8f9dSMark Adams     }
5458c3ff71bSJunchao Zhang     delete aijkok;
54686a27549SJunchao Zhang   } else {
54786a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
54886a27549SJunchao Zhang   }
549152b3e56SJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
550*42550becSJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
551*42550becSJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       NULL);CHKERRQ(ierr);
552152b3e56SJunchao Zhang   A->spptr = NULL;
5538c3ff71bSJunchao Zhang   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
5548c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5558c3ff71bSJunchao Zhang }
5568c3ff71bSJunchao Zhang 
55786a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
55886a27549SJunchao Zhang {
55986a27549SJunchao Zhang   PetscErrorCode ierr;
56086a27549SJunchao Zhang 
56186a27549SJunchao Zhang   PetscFunctionBegin;
56286a27549SJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
56386a27549SJunchao Zhang   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
56486a27549SJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
56586a27549SJunchao Zhang   PetscFunctionReturn(0);
56686a27549SJunchao Zhang }
56786a27549SJunchao Zhang 
568076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
569076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
570a3f881fbSStefano Zampini {
571076ba34aSJunchao Zhang   PetscErrorCode               ierr;
572076ba34aSJunchao Zhang   Mat_SeqAIJ                   *a,*b;
573076ba34aSJunchao Zhang   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
574076ba34aSJunchao Zhang   MatScalarKokkosView          aa,ba,ca;
575076ba34aSJunchao Zhang   MatRowMapKokkosView          ai,bi,ci;
576076ba34aSJunchao Zhang   MatColIdxKokkosView          aj,bj,cj;
577076ba34aSJunchao Zhang   PetscInt                     m,n,nnz,aN;
578a3f881fbSStefano Zampini 
579a3f881fbSStefano Zampini   PetscFunctionBegin;
580076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
581076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
582076ba34aSJunchao Zhang   PetscValidPointer(C,4);
583076ba34aSJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
584076ba34aSJunchao Zhang   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
585c0aa6a63SJacob Faibussowitsch   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
586076ba34aSJunchao Zhang   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
587076ba34aSJunchao Zhang 
588076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
589076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
590076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ*>(A->data);
591076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ*>(B->data);
592076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
593076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
594076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
595076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
596076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
597076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
598076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
599076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
600076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
601076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
602076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
603076ba34aSJunchao Zhang     aj = akok->j_dual.view_device();
604076ba34aSJunchao Zhang     bj = bkok->j_dual.view_device();
605076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
606076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
607076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
608076ba34aSJunchao Zhang     ca = ca_dual.view_device();
609076ba34aSJunchao Zhang     ci = ci_dual.view_device();
610076ba34aSJunchao Zhang     cj = cj_dual.view_device();
611076ba34aSJunchao Zhang 
612076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
613076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
614076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
615076ba34aSJunchao Zhang       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
616076ba34aSJunchao Zhang 
617076ba34aSJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
618076ba34aSJunchao Zhang         ci(i) = coffset;
619076ba34aSJunchao Zhang         if (i == m-1) ci(m) = ai(m) + bi(m);
620076ba34aSJunchao Zhang       });
621076ba34aSJunchao Zhang 
622076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
623076ba34aSJunchao Zhang         if (k < alen) {
624076ba34aSJunchao Zhang           ca(coffset+k) = aa(ai(i)+k);
625076ba34aSJunchao Zhang           cj(coffset+k) = aj(ai(i)+k);
626076ba34aSJunchao Zhang         } else {
627076ba34aSJunchao Zhang           ca(coffset+k) = ba(bi(i)+k-alen);
628076ba34aSJunchao Zhang           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
629076ba34aSJunchao Zhang         }
630076ba34aSJunchao Zhang       });
631076ba34aSJunchao Zhang     });
632076ba34aSJunchao Zhang     ca_dual.modify_device();
633076ba34aSJunchao Zhang     ci_dual.modify_device();
634076ba34aSJunchao Zhang     cj_dual.modify_device();
635076ba34aSJunchao Zhang     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
636076ba34aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
637076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
638076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
639076ba34aSJunchao Zhang     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
640076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
641076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
642076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
643076ba34aSJunchao Zhang 
644076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
645076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
646076ba34aSJunchao Zhang       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
647076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
648076ba34aSJunchao Zhang         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
649076ba34aSJunchao Zhang         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
650076ba34aSJunchao Zhang       });
651076ba34aSJunchao Zhang     });
652076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
653076ba34aSJunchao Zhang   }
654076ba34aSJunchao Zhang   PetscFunctionReturn(0);
655076ba34aSJunchao Zhang }
656076ba34aSJunchao Zhang 
657076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
658076ba34aSJunchao Zhang {
659076ba34aSJunchao Zhang   PetscFunctionBegin;
660076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
661a3f881fbSStefano Zampini   PetscFunctionReturn(0);
662a3f881fbSStefano Zampini }
663a3f881fbSStefano Zampini 
664a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
665a3f881fbSStefano Zampini {
666076ba34aSJunchao Zhang   PetscErrorCode                 ierr;
667a3f881fbSStefano Zampini   Mat_Product                    *product = C->product;
668a3f881fbSStefano Zampini   Mat                            A,B;
669076ba34aSJunchao Zhang   bool                           transA,transB; /* use bool, since KK needs this type */
670a3f881fbSStefano Zampini   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
671a3f881fbSStefano Zampini   Mat_SeqAIJ                     *c;
672076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
673076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB;
674a3f881fbSStefano Zampini 
675a3f881fbSStefano Zampini   PetscFunctionBegin;
676a3f881fbSStefano Zampini   MatCheckProduct(C,1);
677a3f881fbSStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
678076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
679076ba34aSJunchao Zhang 
680076ba34aSJunchao Zhang   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
681076ba34aSJunchao Zhang     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
682076ba34aSJunchao Zhang     PetscFunctionReturn(0);
683076ba34aSJunchao Zhang   }
684076ba34aSJunchao Zhang 
685076ba34aSJunchao Zhang   switch (product->type) {
686076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
687076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
688076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
689076ba34aSJunchao Zhang     default:
690076ba34aSJunchao Zhang       SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
691076ba34aSJunchao Zhang   }
692076ba34aSJunchao Zhang 
693a3f881fbSStefano Zampini   A     = product->A;
694a3f881fbSStefano Zampini   B     = product->B;
695a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
696a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
697a3f881fbSStefano Zampini   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
698a3f881fbSStefano Zampini   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
699a3f881fbSStefano Zampini   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
700076ba34aSJunchao Zhang 
701076ba34aSJunchao Zhang   if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
702076ba34aSJunchao Zhang 
703076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
704076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
705076ba34aSJunchao Zhang 
706076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
707076ba34aSJunchao Zhang   if (transA) {
708076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
709076ba34aSJunchao Zhang     transA = false;
710a3f881fbSStefano Zampini   }
711a3f881fbSStefano Zampini 
712076ba34aSJunchao Zhang   if (transB) {
713076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
714076ba34aSJunchao Zhang     transB = false;
715076ba34aSJunchao Zhang   }
716076ba34aSJunchao Zhang 
717076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
718076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
719076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
720a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
721a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
722c0aa6a63SJacob Faibussowitsch   ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
723a3f881fbSStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
724c0aa6a63SJacob Faibussowitsch   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
725a3f881fbSStefano Zampini   c->reallocs         = 0;
726076ba34aSJunchao Zhang   C->info.mallocs     = 0;
727a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
728a3f881fbSStefano Zampini   C->assembled        = C->was_assembled = PETSC_TRUE;
729a3f881fbSStefano Zampini   C->num_ass++;
730a3f881fbSStefano Zampini   PetscFunctionReturn(0);
731a3f881fbSStefano Zampini }
732a3f881fbSStefano Zampini 
733a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
734a3f881fbSStefano Zampini {
735a3f881fbSStefano Zampini   PetscErrorCode                 ierr;
736076ba34aSJunchao Zhang   Mat_Product                    *product = C->product;
737076ba34aSJunchao Zhang   MatProductType                 ptype;
738076ba34aSJunchao Zhang   Mat                            A,B;
739076ba34aSJunchao Zhang   bool                           transA,transB;
740076ba34aSJunchao Zhang   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
741076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
742076ba34aSJunchao Zhang   MPI_Comm                       comm;
743076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
744a3f881fbSStefano Zampini 
745a3f881fbSStefano Zampini   PetscFunctionBegin;
746a3f881fbSStefano Zampini   MatCheckProduct(C,1);
747076ba34aSJunchao Zhang   ierr = PetscObjectGetComm((PetscObject)C,&comm);
748076ba34aSJunchao Zhang   if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty");
749a3f881fbSStefano Zampini   A       = product->A;
750a3f881fbSStefano Zampini   B       = product->B;
751a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
752a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
753a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
754a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
755076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
756076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
757076ba34aSJunchao Zhang 
758a3f881fbSStefano Zampini   ptype   = product->type;
759a3f881fbSStefano Zampini   switch (ptype) {
760076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
761076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
762076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
763a3f881fbSStefano Zampini     default:
764076ba34aSJunchao Zhang       SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
765a3f881fbSStefano Zampini   }
766a3f881fbSStefano Zampini 
767076ba34aSJunchao Zhang   product->data = pdata = new MatProductData_SeqAIJKokkos();
768076ba34aSJunchao Zhang   pdata->kh.set_team_work_size(16);
769076ba34aSJunchao Zhang   pdata->kh.set_dynamic_scheduling(true);
770076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
771a3f881fbSStefano Zampini 
772076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
773076ba34aSJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
7746ffb9508SJunchao Zhang   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
7756ffb9508SJunchao Zhang      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
7766ffb9508SJunchao Zhang    */
777076ba34aSJunchao Zhang   pdata->kh.create_spgemm_handle(spgemm_alg);
778076ba34aSJunchao Zhang 
779076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
780076ba34aSJunchao Zhang   if (transA) {
781076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
782076ba34aSJunchao Zhang     transA = false;
783076ba34aSJunchao Zhang   }
784076ba34aSJunchao Zhang 
785076ba34aSJunchao Zhang   if (transB) {
786076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
787076ba34aSJunchao Zhang     transB = false;
788076ba34aSJunchao Zhang   }
789076ba34aSJunchao Zhang 
790076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
791076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
792076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
793076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
794076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
795076ba34aSJunchao Zhang   */
796076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
797076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
798076ba34aSJunchao Zhang 
799076ba34aSJunchao Zhang   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
800076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
801076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
802a3f881fbSStefano Zampini   PetscFunctionReturn(0);
803a3f881fbSStefano Zampini }
804a3f881fbSStefano Zampini 
805a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
806076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
807a3f881fbSStefano Zampini {
808a3f881fbSStefano Zampini   PetscErrorCode ierr;
809076ba34aSJunchao Zhang   Mat_Product    *product = mat->product;
810a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
811a3f881fbSStefano Zampini 
812a3f881fbSStefano Zampini   PetscFunctionBegin;
813a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
814a3f881fbSStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
815a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
816a3f881fbSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
817a3f881fbSStefano Zampini   }
818a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
819a3f881fbSStefano Zampini     switch (product->type) {
820a3f881fbSStefano Zampini     case MATPRODUCT_AB:
821a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
822a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
823a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
824a3f881fbSStefano Zampini       break;
825a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
826a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
827a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
828a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
829a3f881fbSStefano Zampini       break;
830a3f881fbSStefano Zampini     default:
831a3f881fbSStefano Zampini       break;
832a3f881fbSStefano Zampini     }
833a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
834a3f881fbSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
835a3f881fbSStefano Zampini   }
836a3f881fbSStefano Zampini   PetscFunctionReturn(0);
837a3f881fbSStefano Zampini }
838a587d139SMark 
839f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
840f0cf5187SStefano Zampini {
841f0cf5187SStefano Zampini   PetscErrorCode   ierr;
842f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
843f0cf5187SStefano Zampini 
844f0cf5187SStefano Zampini   PetscFunctionBegin;
845f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
846f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
847076ba34aSJunchao Zhang   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
848076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
849f0cf5187SStefano Zampini   ierr = WaitForKokkos();CHKERRQ(ierr);
850076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
851f0cf5187SStefano Zampini   PetscFunctionReturn(0);
852f0cf5187SStefano Zampini }
853f0cf5187SStefano Zampini 
854a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
855a587d139SMark {
856a587d139SMark   PetscErrorCode   ierr;
857076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
858a587d139SMark 
859a587d139SMark   PetscFunctionBegin;
860076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
8612328674fSJunchao Zhang   if (aijkok) { /* Only zero the device if data is already there */
862076ba34aSJunchao Zhang     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
863076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
8642328674fSJunchao Zhang   } else { /* Might be preallocated but not assembled */
8652328674fSJunchao Zhang     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
8662328674fSJunchao Zhang   }
867a587d139SMark   PetscFunctionReturn(0);
868a587d139SMark }
869a587d139SMark 
870db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
871*42550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
872db78de30SJunchao Zhang {
873db78de30SJunchao Zhang   PetscErrorCode     ierr;
874db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
875db78de30SJunchao Zhang 
876db78de30SJunchao Zhang   PetscFunctionBegin;
877db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
878db78de30SJunchao Zhang   PetscValidPointer(kv,2);
879db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
880db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
881db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
882076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
883db78de30SJunchao Zhang   PetscFunctionReturn(0);
884db78de30SJunchao Zhang }
885db78de30SJunchao Zhang 
886*42550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
887db78de30SJunchao Zhang {
888db78de30SJunchao Zhang   PetscFunctionBegin;
889db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
890db78de30SJunchao Zhang   PetscValidPointer(kv,2);
891db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
892db78de30SJunchao Zhang   PetscFunctionReturn(0);
893db78de30SJunchao Zhang }
894db78de30SJunchao Zhang 
895*42550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
896db78de30SJunchao Zhang {
897db78de30SJunchao Zhang   PetscErrorCode     ierr;
898db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
899db78de30SJunchao Zhang 
900db78de30SJunchao Zhang   PetscFunctionBegin;
901db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
902db78de30SJunchao Zhang   PetscValidPointer(kv,2);
903db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
904db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
905db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
906076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
907db78de30SJunchao Zhang   PetscFunctionReturn(0);
908db78de30SJunchao Zhang }
909db78de30SJunchao Zhang 
910*42550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
911db78de30SJunchao Zhang {
912db78de30SJunchao Zhang   PetscErrorCode     ierr;
913db78de30SJunchao Zhang 
914db78de30SJunchao Zhang   PetscFunctionBegin;
915db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
916db78de30SJunchao Zhang   PetscValidPointer(kv,2);
917db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
918076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
919db78de30SJunchao Zhang   PetscFunctionReturn(0);
920db78de30SJunchao Zhang }
921db78de30SJunchao Zhang 
922*42550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
923db78de30SJunchao Zhang {
924db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
925db78de30SJunchao Zhang 
926db78de30SJunchao Zhang   PetscFunctionBegin;
927db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
928db78de30SJunchao Zhang   PetscValidPointer(kv,2);
929db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
930db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
931076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
932db78de30SJunchao Zhang   PetscFunctionReturn(0);
933db78de30SJunchao Zhang }
934db78de30SJunchao Zhang 
935*42550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
936db78de30SJunchao Zhang {
937db78de30SJunchao Zhang   PetscErrorCode     ierr;
938db78de30SJunchao Zhang 
939db78de30SJunchao Zhang   PetscFunctionBegin;
940db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
941db78de30SJunchao Zhang   PetscValidPointer(kv,2);
942db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
943076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
944db78de30SJunchao Zhang   PetscFunctionReturn(0);
945db78de30SJunchao Zhang }
946db78de30SJunchao Zhang 
947c17cf699SJunchao Zhang /* Computes Y += alpha X */
948c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
949a587d139SMark {
950a587d139SMark   PetscErrorCode             ierr;
951a587d139SMark   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
952c17cf699SJunchao Zhang   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
953c17cf699SJunchao Zhang   ConstMatScalarKokkosView   Xa;
954c17cf699SJunchao Zhang   MatScalarKokkosView        Ya;
955a587d139SMark 
956a587d139SMark   PetscFunctionBegin;
957c17cf699SJunchao Zhang   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
958c17cf699SJunchao Zhang   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
959c17cf699SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
960c17cf699SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
961db78de30SJunchao Zhang 
962c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
963c17cf699SJunchao Zhang     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
964a587d139SMark     PetscBool e;
965a587d139SMark     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
966a587d139SMark     if (e) {
967a587d139SMark       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
968c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
969a587d139SMark     }
970a587d139SMark   }
971db78de30SJunchao Zhang 
972c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
973c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
974c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
975c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
976c17cf699SJunchao Zhang   */
977c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
978c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
979c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
980c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
981c17cf699SJunchao Zhang 
982c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
983c17cf699SJunchao Zhang     KokkosBlas::axpy(alpha,Xa,Ya);
984c17cf699SJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(Y);
985c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
986c17cf699SJunchao Zhang     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
987c17cf699SJunchao Zhang     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
988c17cf699SJunchao Zhang 
989c17cf699SJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
990c17cf699SJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
991c17cf699SJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
992c17cf699SJunchao Zhang         PetscInt p,q = Yi(i);
993c17cf699SJunchao Zhang         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
994c17cf699SJunchao Zhang           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
995c17cf699SJunchao Zhang           if (Xj(p) == Yj(q)) { /* Found it */
996c17cf699SJunchao Zhang             Ya(q) += alpha * Xa(p);
997c17cf699SJunchao Zhang             q++;
998a587d139SMark           } else {
999c17cf699SJunchao Zhang             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1000c17cf699SJunchao Zhang                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1001c17cf699SJunchao Zhang             */
1002c17cf699SJunchao Zhang             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1003a587d139SMark           }
1004c17cf699SJunchao Zhang         }
1005c17cf699SJunchao Zhang       });
1006c17cf699SJunchao Zhang     });
1007c17cf699SJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(Y);
1008c17cf699SJunchao Zhang   } else { /* different nonzero patterns */
1009c17cf699SJunchao Zhang     Mat             Z;
1010c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
1011c17cf699SJunchao Zhang     KernelHandle    kh;
1012c17cf699SJunchao Zhang     kh.create_spadd_handle(false);
1013c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1014c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1015c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
1016c17cf699SJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1017c17cf699SJunchao Zhang     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1018c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1019c17cf699SJunchao Zhang   }
1020c17cf699SJunchao Zhang 
1021a587d139SMark   PetscFunctionReturn(0);
1022a587d139SMark }
1023a587d139SMark 
1024*42550becSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
1025*42550becSJunchao Zhang {
1026*42550becSJunchao Zhang   PetscErrorCode            ierr;
1027*42550becSJunchao Zhang   MPI_Comm                  comm;
1028*42550becSJunchao Zhang   PetscInt                  *i,*j,*perm,*jmap;
1029*42550becSJunchao Zhang   PetscInt                  k,M,N,p,q,row,start,end,nnz,nneg;
1030*42550becSJunchao Zhang   PetscBool                 has_repeats = PETSC_FALSE;
1031*42550becSJunchao Zhang   PetscInt                  *Ai,*Aj;
1032*42550becSJunchao Zhang   PetscScalar               *Aa;
1033*42550becSJunchao Zhang   Mat_SeqAIJKokkos          *akok;
1034*42550becSJunchao Zhang   Mat_SeqAIJ                *aseq;
1035*42550becSJunchao Zhang   MatRowMapKokkosViewHost   perm_h("perm",n),jmap_h;
1036*42550becSJunchao Zhang   Mat                       newmat;
1037*42550becSJunchao Zhang 
1038*42550becSJunchao Zhang   PetscFunctionBegin;
1039*42550becSJunchao Zhang   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
1040*42550becSJunchao Zhang   ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr);
1041*42550becSJunchao Zhang   ierr = PetscMalloc2(n,&i,n,&j);CHKERRQ(ierr);
1042*42550becSJunchao Zhang   ierr = PetscArraycpy(i,coo_i,n);CHKERRQ(ierr); /* Make a copy since we'll modify it */
1043*42550becSJunchao Zhang   ierr = PetscArraycpy(j,coo_j,n);CHKERRQ(ierr);
1044*42550becSJunchao Zhang   perm = perm_h.data();
1045*42550becSJunchao Zhang   for (k=0; k<n; k++) { /* Ignore entries with negative row or col indices */
1046*42550becSJunchao Zhang     if (j[k] < 0) i[k] = -1;
1047*42550becSJunchao Zhang     perm[k] = k;
1048*42550becSJunchao Zhang   }
1049*42550becSJunchao Zhang 
1050*42550becSJunchao Zhang   /* Sort by row */
1051*42550becSJunchao Zhang   ierr = PetscSortIntWithArrayPair(n,i,j,perm);CHKERRQ(ierr);
1052*42550becSJunchao Zhang   for (k=0; k<n; k++) {if (i[k] >= 0) break;} /* Advance k to the first row with a non-negative index */
1053*42550becSJunchao Zhang   nneg = k;
1054*42550becSJunchao Zhang   Kokkos::resize(jmap_h,n-nneg+1); /* Allocate an extra to make a CSR-like data structure. jmap[i] is the number of repeats for i-th nonzero */
1055*42550becSJunchao Zhang   nnz  = 0; /* Total number of unique nonzeros to be counted */
1056*42550becSJunchao Zhang   jmap = jmap_h.data(); /* In the end, only the first nnz+1 elements in jmap[] are significant. */
1057*42550becSJunchao Zhang   jmap++; /* Inc jmap by 1 for convinience */
1058*42550becSJunchao Zhang 
1059*42550becSJunchao Zhang   ierr = PetscCalloc1(M+1,&Ai);CHKERRQ(ierr); /* CSR of A */
1060*42550becSJunchao Zhang   ierr = PetscMalloc1(n-nneg,&Aj);CHKERRQ(ierr); /* We have at most n-k unique nonzeros */
1061*42550becSJunchao Zhang 
1062*42550becSJunchao Zhang   /* In each row, sort by column, then unique column indices to get row length */
1063*42550becSJunchao Zhang   Ai++; /* Inc by 1 for convinience */
1064*42550becSJunchao Zhang   q = 0; /* q-th unique nonzero, with q starting from 0 */
1065*42550becSJunchao Zhang   while (k<n) {
1066*42550becSJunchao Zhang     row   = i[k];
1067*42550becSJunchao Zhang     start = k; /* [start,end) indices for this row */
1068*42550becSJunchao Zhang     while (k<n && i[k] == row) k++;
1069*42550becSJunchao Zhang     end   = k;
1070*42550becSJunchao Zhang     ierr  = PetscSortIntWithArray(end-start,j+start,perm+start);CHKERRQ(ierr);
1071*42550becSJunchao Zhang     /* Find number of unique col entries in this row */
1072*42550becSJunchao Zhang     Aj[q]   = j[start]; /* Log the first nonzero in this row */
1073*42550becSJunchao Zhang     jmap[q] = 1; /* Number of repeats of this nozero entry */
1074*42550becSJunchao Zhang     Ai[row] = 1;
1075*42550becSJunchao Zhang     nnz++;
1076*42550becSJunchao Zhang 
1077*42550becSJunchao Zhang     for (p=start+1; p<end; p++) { /* Scan remaining nonzero in this row */
1078*42550becSJunchao Zhang       if (j[p] != j[p-1]) { /* Meet a new nonzero */
1079*42550becSJunchao Zhang         q++;
1080*42550becSJunchao Zhang         jmap[q] = 1;
1081*42550becSJunchao Zhang         Aj[q]   = j[p];
1082*42550becSJunchao Zhang         Ai[row]++;
1083*42550becSJunchao Zhang         nnz++;
1084*42550becSJunchao Zhang       } else {
1085*42550becSJunchao Zhang         jmap[q]++;
1086*42550becSJunchao Zhang         has_repeats = PETSC_TRUE;
1087*42550becSJunchao Zhang       }
1088*42550becSJunchao Zhang     }
1089*42550becSJunchao Zhang     q++; /* Move to next row and thus next unique nonzero */
1090*42550becSJunchao Zhang   }
1091*42550becSJunchao Zhang   ierr = PetscFree2(i,j);CHKERRQ(ierr);
1092*42550becSJunchao Zhang 
1093*42550becSJunchao Zhang   Ai--; /* Back to the beginning of Ai[] */
1094*42550becSJunchao Zhang   for (k=0; k<M; k++) Ai[k+1] += Ai[k];
1095*42550becSJunchao Zhang   jmap--; /* Back to the beginning of jmap[] */
1096*42550becSJunchao Zhang   jmap[0] = 0;
1097*42550becSJunchao Zhang   if (has_repeats) { /* Only transform jmap[] to CSR when having repeats, otherwise jmap[] is not used */
1098*42550becSJunchao Zhang     for (k=0; k<nnz; k++) jmap[k+1] += jmap[k];
1099*42550becSJunchao Zhang     Kokkos::resize(jmap_h,nnz+1); /* Resize wrt the actual number of nonzeros */
1100*42550becSJunchao Zhang   }
1101*42550becSJunchao Zhang 
1102*42550becSJunchao Zhang   if (nnz < n-nneg) { /* Realloc Aj[] to actual number of nonzeros */
1103*42550becSJunchao Zhang     PetscInt *Aj_new;
1104*42550becSJunchao Zhang     ierr = PetscMalloc1(nnz,&Aj_new);CHKERRQ(ierr);
1105*42550becSJunchao Zhang     ierr = PetscArraycpy(Aj_new,Aj,nnz);CHKERRQ(ierr);
1106*42550becSJunchao Zhang     ierr = PetscFree(Aj);CHKERRQ(ierr);
1107*42550becSJunchao Zhang     Aj = Aj_new;
1108*42550becSJunchao Zhang   }
1109*42550becSJunchao Zhang 
1110*42550becSJunchao Zhang   ierr = PetscMalloc1(nnz,&Aa);CHKERRQ(ierr);
1111*42550becSJunchao Zhang   ierr = MatCreateSeqAIJWithArrays(comm,M,N,Ai,Aj,Aa,&newmat);CHKERRQ(ierr);
1112*42550becSJunchao Zhang   aseq = static_cast<Mat_SeqAIJ*>(newmat->data);
1113*42550becSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE; /* Let newmat own Ai,Aj,Aa */
1114*42550becSJunchao Zhang   aseq->free_a       = aseq->free_ij = PETSC_TRUE;
1115*42550becSJunchao Zhang 
1116*42550becSJunchao Zhang   ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
1117*42550becSJunchao Zhang   ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
1118*42550becSJunchao Zhang   ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
1119*42550becSJunchao Zhang 
1120*42550becSJunchao Zhang   if (nneg) { /* Discard heading entries with negative indices in perm_h, as we'll access it from index 0 in MatSetValuesCOO */
1121*42550becSJunchao Zhang     MatRowMapKokkosViewHost   newperm_h("perm",n-nneg);
1122*42550becSJunchao Zhang     Kokkos::deep_copy(newperm_h,Kokkos::subview(perm_h,Kokkos::make_pair(nneg,n)));
1123*42550becSJunchao Zhang     perm_h = newperm_h;
1124*42550becSJunchao Zhang   }
1125*42550becSJunchao Zhang 
1126*42550becSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1127*42550becSJunchao Zhang   akok->SetUpCOO(n,has_repeats,jmap_h,perm_h);
1128*42550becSJunchao Zhang   PetscFunctionReturn(0);
1129*42550becSJunchao Zhang }
1130*42550becSJunchao Zhang 
1131*42550becSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
1132*42550becSJunchao Zhang {
1133*42550becSJunchao Zhang   PetscErrorCode              ierr;
1134*42550becSJunchao Zhang   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
1135*42550becSJunchao Zhang   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1136*42550becSJunchao Zhang   PetscInt                    nz = aseq->nz;
1137*42550becSJunchao Zhang   const MatRowMapKokkosView&  jmap = akok->jmap_d;
1138*42550becSJunchao Zhang   const MatRowMapKokkosView&  perm = akok->perm_d;
1139*42550becSJunchao Zhang   MatScalarKokkosView         Aa;
1140*42550becSJunchao Zhang   ConstMatScalarKokkosView    kv;
1141*42550becSJunchao Zhang   PetscMemType                memtype;
1142*42550becSJunchao Zhang 
1143*42550becSJunchao Zhang   PetscFunctionBegin;
1144*42550becSJunchao Zhang   if (!v) { /* NULL v means an all zero array */
1145*42550becSJunchao Zhang     ierr = MatZeroEntries(A);CHKERRQ(ierr);
1146*42550becSJunchao Zhang     PetscFunctionReturn(0);
1147*42550becSJunchao Zhang   }
1148*42550becSJunchao Zhang 
1149*42550becSJunchao Zhang   ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
1150*42550becSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1151*42550becSJunchao Zhang     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,akok->coo_n));
1152*42550becSJunchao Zhang   } else {
1153*42550becSJunchao Zhang     kv = ConstMatScalarKokkosView(v,akok->coo_n); /* Directly use v[]'s memory */
1154*42550becSJunchao Zhang   }
1155*42550becSJunchao Zhang 
1156*42550becSJunchao Zhang   ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr); /* Might read and write matrix values */
1157*42550becSJunchao Zhang   if (imode == INSERT_VALUES) {
1158*42550becSJunchao Zhang     Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
1159*42550becSJunchao Zhang   }
1160*42550becSJunchao Zhang 
1161*42550becSJunchao Zhang   if (akok->coo_has_repeats) {
1162*42550becSJunchao Zhang     Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) {
1163*42550becSJunchao Zhang       for (PetscInt k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));
1164*42550becSJunchao Zhang     });
1165*42550becSJunchao Zhang   } else {
1166*42550becSJunchao Zhang     Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) {Aa(i) += kv(perm(i));});
1167*42550becSJunchao Zhang   }
1168*42550becSJunchao Zhang   ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);
1169*42550becSJunchao Zhang   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1170*42550becSJunchao Zhang   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1171*42550becSJunchao Zhang   PetscFunctionReturn(0);
1172*42550becSJunchao Zhang }
1173*42550becSJunchao Zhang 
117486a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
11758f7e8f9dSMark Adams {
11768f7e8f9dSMark Adams   PetscErrorCode ierr;
11778f7e8f9dSMark Adams 
11788f7e8f9dSMark Adams   PetscFunctionBegin;
11798f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
11808f7e8f9dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
11818f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
11828f7e8f9dSMark Adams   PetscFunctionReturn(0);
11838f7e8f9dSMark Adams }
11848f7e8f9dSMark Adams 
11858c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
11868c3ff71bSJunchao Zhang {
1187*42550becSJunchao Zhang   PetscErrorCode     ierr;
1188076ba34aSJunchao Zhang   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1189076ba34aSJunchao Zhang 
11908c3ff71bSJunchao Zhang   PetscFunctionBegin;
1191076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
11926f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
11936f3d89d0SStefano Zampini 
11948c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
11958c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
11968c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1197a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1198f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1199a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1200076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
12018c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
12028c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
12038c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
12048c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
12058c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
12068c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1207076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
12080ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1209152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1210076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1211076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1212076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1213076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1214076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1215076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1216*42550becSJunchao Zhang 
1217*42550becSJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr);
1218*42550becSJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr);
1219076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1220076ba34aSJunchao Zhang }
1221076ba34aSJunchao Zhang 
1222076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1223076ba34aSJunchao Zhang {
1224076ba34aSJunchao Zhang   PetscErrorCode ierr;
1225076ba34aSJunchao Zhang   Mat_SeqAIJ     *aseq;
1226076ba34aSJunchao Zhang   PetscInt       i,m,n;
1227076ba34aSJunchao Zhang 
1228076ba34aSJunchao Zhang   PetscFunctionBegin;
1229076ba34aSJunchao Zhang   if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1230076ba34aSJunchao Zhang 
1231076ba34aSJunchao Zhang   m    = akok->nrows();
1232076ba34aSJunchao Zhang   n    = akok->ncols();
1233076ba34aSJunchao Zhang   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1234076ba34aSJunchao Zhang   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1235076ba34aSJunchao Zhang 
1236076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
1237076ba34aSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1238076ba34aSJunchao Zhang   aseq = (Mat_SeqAIJ*)(A)->data;
1239076ba34aSJunchao Zhang 
1240076ba34aSJunchao Zhang   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1241076ba34aSJunchao Zhang   akok->j_dual.sync_host();
1242076ba34aSJunchao Zhang 
1243076ba34aSJunchao Zhang   aseq->i            = akok->i_host_data();
1244076ba34aSJunchao Zhang   aseq->j            = akok->j_host_data();
1245076ba34aSJunchao Zhang   aseq->a            = akok->a_host_data();
1246076ba34aSJunchao Zhang   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1247076ba34aSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE;
1248076ba34aSJunchao Zhang   aseq->free_a       = PETSC_FALSE;
1249076ba34aSJunchao Zhang   aseq->free_ij      = PETSC_FALSE;
1250076ba34aSJunchao Zhang   aseq->nz           = akok->nnz();
1251076ba34aSJunchao Zhang   aseq->maxnz        = aseq->nz;
1252076ba34aSJunchao Zhang 
1253076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1254076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1255076ba34aSJunchao Zhang   for (i=0; i<m; i++) {
1256076ba34aSJunchao Zhang     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1257076ba34aSJunchao Zhang   }
1258076ba34aSJunchao Zhang 
1259076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1260076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1261076ba34aSJunchao Zhang   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1262076ba34aSJunchao Zhang   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1263076ba34aSJunchao Zhang   A->spptr = akok;
1264076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1265076ba34aSJunchao Zhang }
1266076ba34aSJunchao Zhang 
1267076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1268076ba34aSJunchao Zhang 
1269076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1270076ba34aSJunchao Zhang  */
1271076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1272076ba34aSJunchao Zhang {
1273076ba34aSJunchao Zhang   PetscErrorCode ierr;
1274076ba34aSJunchao Zhang 
1275076ba34aSJunchao Zhang   PetscFunctionBegin;
1276076ba34aSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1277076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
12788c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
12798c3ff71bSJunchao Zhang }
12808c3ff71bSJunchao Zhang 
12818c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
1282152b3e56SJunchao Zhang /*@C
12838c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
12848c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
12858c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
12868c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
12878c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
12888c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
12898c3ff71bSJunchao Zhang 
12908c3ff71bSJunchao Zhang    Collective
12918c3ff71bSJunchao Zhang 
12928c3ff71bSJunchao Zhang    Input Parameters:
12938c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
12948c3ff71bSJunchao Zhang .  m - number of rows
12958c3ff71bSJunchao Zhang .  n - number of columns
12968c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
12978c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
12988c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
12998c3ff71bSJunchao Zhang 
13008c3ff71bSJunchao Zhang    Output Parameter:
13018c3ff71bSJunchao Zhang .  A - the matrix
13028c3ff71bSJunchao Zhang 
13038c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
13048c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
13058c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
13068c3ff71bSJunchao Zhang 
13078c3ff71bSJunchao Zhang    Notes:
13088c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
13098c3ff71bSJunchao Zhang 
13108c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
13118c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
13128c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
13138c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
13148c3ff71bSJunchao Zhang 
13158c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
13168c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
13178c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
13188c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
13198c3ff71bSJunchao Zhang 
13208c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
13218c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
13228c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
13238c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
13248c3ff71bSJunchao Zhang 
13258c3ff71bSJunchao Zhang    Level: intermediate
13268c3ff71bSJunchao Zhang 
1327076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
13288c3ff71bSJunchao Zhang @*/
13298c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
13308c3ff71bSJunchao Zhang {
13318c3ff71bSJunchao Zhang   PetscErrorCode ierr;
13328c3ff71bSJunchao Zhang 
13338c3ff71bSJunchao Zhang   PetscFunctionBegin;
13348c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
13358c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
13368c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
13378c3ff71bSJunchao Zhang   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
13388c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
13398c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
13408c3ff71bSJunchao Zhang }
1341930e68a5SMark Adams 
13428f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
13438f7e8f9dSMark Adams //
13448f7e8f9dSMark Adams // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
13458f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
13468f7e8f9dSMark Adams //
13478f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1348930e68a5SMark Adams {
13498f7e8f9dSMark Adams   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
13508f7e8f9dSMark Adams   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
13518f7e8f9dSMark Adams   IS                 isrow = b->row,isicol = b->icol;
1352930e68a5SMark Adams   PetscErrorCode     ierr;
13538f7e8f9dSMark Adams   const PetscInt     *r_h,*ic_h;
1354076ba34aSJunchao Zhang   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data();
1355076ba34aSJunchao Zhang   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1356076ba34aSJunchao Zhang   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
13578f7e8f9dSMark Adams   PetscBool          row_identity,col_identity;
13588f7e8f9dSMark Adams   PetscInt           nc, Nf, nVec=32; // should be a parameter
13598f7e8f9dSMark Adams   PetscContainer     container;
1360930e68a5SMark Adams 
1361930e68a5SMark Adams   PetscFunctionBegin;
1362c0aa6a63SJacob Faibussowitsch   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
13638f7e8f9dSMark Adams   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
13648f7e8f9dSMark Adams   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
13658f7e8f9dSMark Adams   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
13668f7e8f9dSMark Adams   if (container) {
13678f7e8f9dSMark Adams     PetscInt *pNf=NULL, nv;
13688f7e8f9dSMark Adams     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
13698f7e8f9dSMark Adams     Nf = (*pNf)%1000;
13708f7e8f9dSMark Adams     nv = (*pNf)/1000;
13718f7e8f9dSMark Adams     if (nv>0) nVec = nv;
13728f7e8f9dSMark Adams   } else Nf = 1;
1373c0aa6a63SJacob Faibussowitsch   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %" PetscInt_FMT " %" PetscInt_FMT,n,Nf);
13748f7e8f9dSMark Adams   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
13758f7e8f9dSMark Adams   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
13768f7e8f9dSMark Adams   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
13778f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
13788f7e8f9dSMark Adams   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
13798f7e8f9dSMark Adams #endif
13808f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
13818f7e8f9dSMark Adams   {
13828f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
13838f7e8f9dSMark Adams     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
13848f7e8f9dSMark Adams     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
13858f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
13868f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
13878f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
13888f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
13898f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
13908f7e8f9dSMark Adams     size_t flops_h = 0.0;
13918f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
13928f7e8f9dSMark Adams     Kokkos::View<size_t> d_flops_k ("flops");
13938f7e8f9dSMark Adams     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
13948f7e8f9dSMark Adams     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
13958f7e8f9dSMark Adams     Kokkos::deep_copy (d_flops_k, h_flops_k);
13968f7e8f9dSMark Adams     Kokkos::deep_copy (d_r_k, h_r_k);
13978f7e8f9dSMark Adams     Kokkos::deep_copy (d_ic_k, h_ic_k);
13988f7e8f9dSMark Adams     // Fill A --> fact
13998f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1400042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
14018f7e8f9dSMark Adams         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
14028f7e8f9dSMark Adams         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
14038f7e8f9dSMark Adams         // zero rows of B
14048f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
14058f7e8f9dSMark Adams             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
14068f7e8f9dSMark Adams             PetscScalar *baL = ba_d + bi_d[rowb];
14078f7e8f9dSMark Adams             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
14088f7e8f9dSMark Adams             /* zero (unfactored row) */
14098f7e8f9dSMark Adams             for (int j=0;j<nzbL;j++) baL[j] = 0;
14108f7e8f9dSMark Adams             for (int j=0;j<nzbU;j++) baU[j] = 0;
14118f7e8f9dSMark Adams           });
14128f7e8f9dSMark Adams         // copy A into B
14138f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
14148f7e8f9dSMark Adams             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
14158f7e8f9dSMark Adams             const PetscScalar *av    = aa_d + ai_d[rowa];
14168f7e8f9dSMark Adams             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
14178f7e8f9dSMark Adams             /* load in initial (unfactored row) */
14188f7e8f9dSMark Adams             for (int j=0;j<nza;j++) {
14198f7e8f9dSMark Adams               PetscInt    colb = ic[ajtmp[j]];
14208f7e8f9dSMark Adams               PetscScalar vala = av[j];
14218f7e8f9dSMark Adams               if (colb == rowb) {
14228f7e8f9dSMark Adams                 *(ba_d + bdiag_d[rowb]) = vala;
14238f7e8f9dSMark Adams               } else {
14248f7e8f9dSMark Adams                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
14258f7e8f9dSMark Adams                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
14268f7e8f9dSMark Adams                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
14278f7e8f9dSMark Adams                 for (int j=0; j<nz ; j++) {
14288f7e8f9dSMark Adams                   if (pbj[j] == colb) {
14298f7e8f9dSMark Adams                     pba[j] = vala;
14308f7e8f9dSMark Adams                     set++;
14318f7e8f9dSMark Adams                     break;
14328f7e8f9dSMark Adams                   }
14338f7e8f9dSMark Adams                 }
14348f7e8f9dSMark Adams                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
14358f7e8f9dSMark Adams               }
14368f7e8f9dSMark Adams             }
14378f7e8f9dSMark Adams           });
14388f7e8f9dSMark Adams       });
14398f7e8f9dSMark Adams     Kokkos::fence();
1440930e68a5SMark Adams 
14418f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
14428f7e8f9dSMark Adams         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
14438f7e8f9dSMark Adams         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
14448f7e8f9dSMark Adams         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1445042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
14468f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc;
14478f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
14488f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
14498f7e8f9dSMark Adams         for (int ii=start; ii<end-1; ii++) {
14508f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
14518f7e8f9dSMark Adams           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
14528f7e8f9dSMark Adams           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
14538f7e8f9dSMark Adams           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
14548f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
14558f7e8f9dSMark Adams               PetscInt kIdx = j*Ni + field_block_idx;
14568f7e8f9dSMark Adams               if (kIdx >= nzUi) /* void */ ;
14578f7e8f9dSMark Adams               else {
14588f7e8f9dSMark Adams                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
14598f7e8f9dSMark Adams                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
14608f7e8f9dSMark Adams                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
14618f7e8f9dSMark Adams                 size_t         st_idx;
14628f7e8f9dSMark Adams                 // find and do L(k,i) = A(:k,i) / A(i,i)
14638f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
14648f7e8f9dSMark Adams                 // get column, there has got to be a better way
14658f7e8f9dSMark Adams                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
14668f7e8f9dSMark Adams                     if (pjL[j] == ii) {
14678f7e8f9dSMark Adams                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
14688f7e8f9dSMark Adams                       idx = j; // output
14698f7e8f9dSMark Adams                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
14708f7e8f9dSMark Adams                     }
14718f7e8f9dSMark Adams                 }, st_idx);
14728f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
147399551766SMark Adams #if defined(PETSC_USE_DEBUG)
147499551766SMark Adams                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
147599551766SMark Adams #endif
147699551766SMark Adams                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
14778f7e8f9dSMark Adams                 // U(i+1,:end)
14788f7e8f9dSMark Adams                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
14798f7e8f9dSMark Adams                       PetscScalar Uij = baUi[uiIdx];
14808f7e8f9dSMark Adams                       PetscInt    col = bjUi[uiIdx];
14818f7e8f9dSMark Adams                       if (col==myk) {
14828f7e8f9dSMark Adams                         // A_kk = A_kk - L_ki * U_ij(k)
14838f7e8f9dSMark Adams                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
14848f7e8f9dSMark Adams                         *Akkv = *Akkv - L_ki() * Uij; // UiK
14858f7e8f9dSMark Adams                       } else {
14868f7e8f9dSMark Adams                         PetscScalar    *start, *end, *pAkjv=NULL;
14878f7e8f9dSMark Adams                         PetscInt       high, low;
14888f7e8f9dSMark Adams                         const PetscInt *startj;
14898f7e8f9dSMark Adams                         if (col<myk) { // L
14908f7e8f9dSMark Adams                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
14918f7e8f9dSMark Adams                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
14928f7e8f9dSMark Adams                           start = pLki+1; // start at pLki+1, A22(myk,1)
14938f7e8f9dSMark Adams                           startj= bj_d + bi_d[myk] + idx;
14948f7e8f9dSMark Adams                           end   = ba_d + bi_d[myk+1];
14958f7e8f9dSMark Adams                         } else {
14968f7e8f9dSMark Adams                           PetscInt idx = bdiag_d[myk+1]+1;
14978f7e8f9dSMark Adams                           start = ba_d + idx;
14988f7e8f9dSMark Adams                           startj= bj_d + idx;
14998f7e8f9dSMark Adams                           end   = ba_d + bdiag_d[myk];
15008f7e8f9dSMark Adams                         }
15018f7e8f9dSMark Adams                         // search for 'col', use bisection search - TODO
15028f7e8f9dSMark Adams                         low  = 0;
15038f7e8f9dSMark Adams                         high = (PetscInt)(end-start);
15048f7e8f9dSMark Adams                         while (high-low > 5) {
15058f7e8f9dSMark Adams                           int t = (low+high)/2;
15068f7e8f9dSMark Adams                           if (startj[t] > col) high = t;
15078f7e8f9dSMark Adams                           else                 low  = t;
15088f7e8f9dSMark Adams                         }
15098f7e8f9dSMark Adams                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
15108f7e8f9dSMark Adams                           if (startj[pAkjv-start] == col) break;
15118f7e8f9dSMark Adams                         }
151299551766SMark Adams #if defined(PETSC_USE_DEBUG)
151399551766SMark Adams                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
151499551766SMark Adams #endif
15158f7e8f9dSMark Adams                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
15168f7e8f9dSMark Adams                       }
15178f7e8f9dSMark Adams                     });
15188f7e8f9dSMark Adams               }
15198f7e8f9dSMark Adams             });
15208f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
15218f7e8f9dSMark Adams           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
15228f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
15238f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
15248f7e8f9dSMark Adams       });
15258f7e8f9dSMark Adams     Kokkos::fence();
15268f7e8f9dSMark Adams     Kokkos::deep_copy (h_flops_k, d_flops_k);
15278f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
15288f7e8f9dSMark Adams     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
15298f7e8f9dSMark Adams #elif  defined(PETSC_USE_LOG)
15308f7e8f9dSMark Adams     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
15318f7e8f9dSMark Adams #endif
15328f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
15338f7e8f9dSMark Adams         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
15348f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
15358f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
15368f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
15378f7e8f9dSMark Adams             int i = start + outer_index*Ni + lg_rank%Ni;
15388f7e8f9dSMark Adams             if (i < end) {
15398f7e8f9dSMark Adams               PetscScalar *pv = ba_d + bdiag_d[i];
15408f7e8f9dSMark Adams               *pv = 1.0/(*pv);
15418f7e8f9dSMark Adams             }
15428f7e8f9dSMark Adams           });
15438f7e8f9dSMark Adams       });
15448f7e8f9dSMark Adams   }
15458f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
15468f7e8f9dSMark Adams   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
15478f7e8f9dSMark Adams #endif
15488f7e8f9dSMark Adams   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
15498f7e8f9dSMark Adams   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
15508f7e8f9dSMark Adams 
15518f7e8f9dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
15528f7e8f9dSMark Adams   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
15538f7e8f9dSMark Adams   if (b->inode.size) {
15548f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
15558f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
15568f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
15578f7e8f9dSMark Adams   } else {
15588f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
15598f7e8f9dSMark Adams   }
15608f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
15618f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
15628f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
15638f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
15648f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
15658f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
15668f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
15678f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
15688f7e8f9dSMark Adams 
1569930e68a5SMark Adams   PetscFunctionReturn(0);
1570930e68a5SMark Adams }
1571930e68a5SMark Adams 
157286a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1573930e68a5SMark Adams {
1574930e68a5SMark Adams   PetscErrorCode   ierr;
1575930e68a5SMark Adams 
1576930e68a5SMark Adams   PetscFunctionBegin;
1577930e68a5SMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
157886a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
157986a27549SJunchao Zhang   PetscFunctionReturn(0);
158086a27549SJunchao Zhang }
158186a27549SJunchao Zhang 
158286a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
158386a27549SJunchao Zhang {
158486a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
158586a27549SJunchao Zhang 
158686a27549SJunchao Zhang   PetscFunctionBegin;
158786a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
158886a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
158986a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
159086a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
159186a27549SJunchao Zhang   }
159286a27549SJunchao Zhang   PetscFunctionReturn(0);
159386a27549SJunchao Zhang }
159486a27549SJunchao Zhang 
159586a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
159686a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
159786a27549SJunchao Zhang {
159886a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1599076ba34aSJunchao Zhang   MatColIdxType             n = A->rmap->n;
160086a27549SJunchao Zhang 
160186a27549SJunchao Zhang   PetscFunctionBegin;
160286a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
160386a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
1604076ba34aSJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
160586a27549SJunchao Zhang     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1606076ba34aSJunchao Zhang     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1607076ba34aSJunchao Zhang     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
160886a27549SJunchao Zhang 
160986a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1610076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1611076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1612076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
161386a27549SJunchao Zhang         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
161486a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
161586a27549SJunchao Zhang 
161686a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
161786a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
161886a27549SJunchao Zhang     */
1619076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1620076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
162186a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
162286a27549SJunchao Zhang 
162386a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
162486a27549SJunchao Zhang 
162586a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
1626076ba34aSJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
162786a27549SJunchao Zhang     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1628076ba34aSJunchao Zhang     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1629076ba34aSJunchao Zhang     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
163086a27549SJunchao Zhang 
163186a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1632076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1633076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1634076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
163586a27549SJunchao Zhang         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
163686a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
163786a27549SJunchao Zhang 
163886a27549SJunchao Zhang     /* Sort indices. See comments above */
1639076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1640076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
164186a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
164286a27549SJunchao Zhang 
164386a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
164486a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
164586a27549SJunchao Zhang   }
164686a27549SJunchao Zhang   PetscFunctionReturn(0);
164786a27549SJunchao Zhang }
164886a27549SJunchao Zhang 
164986a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
165086a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
165186a27549SJunchao Zhang {
165286a27549SJunchao Zhang   PetscErrorCode                 ierr;
165386a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
165486a27549SJunchao Zhang   PetscScalarKokkosView          xv;
165586a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
165686a27549SJunchao Zhang 
165786a27549SJunchao Zhang   PetscFunctionBegin;
165886a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
165986a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
166086a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
166186a27549SJunchao Zhang   /* Solve L tmpv = b */
16623f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
166386a27549SJunchao Zhang   /* Solve Ux = tmpv */
16643f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
166586a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
166686a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
166786a27549SJunchao Zhang   PetscFunctionReturn(0);
166886a27549SJunchao Zhang }
166986a27549SJunchao Zhang 
1670076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
167186a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
167286a27549SJunchao Zhang {
167386a27549SJunchao Zhang   PetscErrorCode                 ierr;
167486a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
167586a27549SJunchao Zhang   PetscScalarKokkosView          xv;
167686a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
167786a27549SJunchao Zhang 
167886a27549SJunchao Zhang   PetscFunctionBegin;
167986a27549SJunchao Zhang   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
168086a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
168186a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
168286a27549SJunchao Zhang   /* Solve U^T tmpv = b */
168386a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
168486a27549SJunchao Zhang 
168586a27549SJunchao Zhang   /* Solve L^T x = tmpv */
168686a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
168786a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
168886a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
168986a27549SJunchao Zhang   PetscFunctionReturn(0);
169086a27549SJunchao Zhang }
169186a27549SJunchao Zhang 
169286a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
169386a27549SJunchao Zhang {
169486a27549SJunchao Zhang   PetscErrorCode                 ierr;
169586a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
169686a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
169786a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
169886a27549SJunchao Zhang 
169986a27549SJunchao Zhang   PetscFunctionBegin;
170086a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1701076ba34aSJunchao Zhang 
1702076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1703076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1704076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1705076ba34aSJunchao Zhang 
1706076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
170786a27549SJunchao Zhang 
170886a27549SJunchao Zhang   B->assembled                       = PETSC_TRUE;
170986a27549SJunchao Zhang   B->preallocated                    = PETSC_TRUE;
171086a27549SJunchao Zhang   B->ops->solve                      = MatSolve_SeqAIJKokkos;
171186a27549SJunchao Zhang   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
171286a27549SJunchao Zhang   B->ops->matsolve                   = NULL;
171386a27549SJunchao Zhang   B->ops->matsolvetranspose          = NULL;
171486a27549SJunchao Zhang   B->offloadmask                     = PETSC_OFFLOAD_GPU;
171586a27549SJunchao Zhang 
171686a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
171786a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
171886a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
171986a27549SJunchao Zhang   PetscFunctionReturn(0);
172086a27549SJunchao Zhang }
172186a27549SJunchao Zhang 
172286a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
172386a27549SJunchao Zhang {
172486a27549SJunchao Zhang   PetscErrorCode                 ierr;
172586a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok;
172686a27549SJunchao Zhang   Mat_SeqAIJ                     *b;
172786a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
172886a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
172986a27549SJunchao Zhang   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
173086a27549SJunchao Zhang   PetscInt                       n = A->rmap->n;
173186a27549SJunchao Zhang 
173286a27549SJunchao Zhang   PetscFunctionBegin;
173386a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
173486a27549SJunchao Zhang   /* Rebuild factors */
173586a27549SJunchao Zhang   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
173686a27549SJunchao Zhang   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
173786a27549SJunchao Zhang 
173886a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
173986a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
174086a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
174186a27549SJunchao Zhang 
174286a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
174386a27549SJunchao Zhang 
174486a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
174586a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
174686a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d,n+1);
174786a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
174886a27549SJunchao Zhang 
174986a27549SJunchao Zhang   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1750076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1751076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1752076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
175386a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
175486a27549SJunchao Zhang 
175586a27549SJunchao Zhang   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
175686a27549SJunchao Zhang   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
175786a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
175886a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
175986a27549SJunchao Zhang 
176086a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
176186a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
176286a27549SJunchao Zhang  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
176386a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
176486a27549SJunchao Zhang  #else
176586a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
176686a27549SJunchao Zhang  #endif
176786a27549SJunchao Zhang 
176886a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
176986a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
177086a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
177186a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
177286a27549SJunchao Zhang 
177386a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
177486a27549SJunchao Zhang   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
177586a27549SJunchao Zhang   b     = (Mat_SeqAIJ*)B->data;
177686a27549SJunchao Zhang   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
177786a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
177886a27549SJunchao Zhang   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
177986a27549SJunchao Zhang 
178086a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
178186a27549SJunchao Zhang   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1782930e68a5SMark Adams   PetscFunctionReturn(0);
1783930e68a5SMark Adams }
1784930e68a5SMark Adams 
17858f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
17868f7e8f9dSMark Adams {
17878f7e8f9dSMark Adams   PetscErrorCode   ierr;
17888f7e8f9dSMark Adams   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
17898f7e8f9dSMark Adams   const PetscInt   nrows   = A->rmap->n;
1790930e68a5SMark Adams 
17918f7e8f9dSMark Adams   PetscFunctionBegin;
17928f7e8f9dSMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
17938f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
17948f7e8f9dSMark Adams   // move B data into Kokkos
17958f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
17968f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
17978f7e8f9dSMark Adams   {
17988f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
17998f7e8f9dSMark Adams     if (!baijkok->diag_d) {
18008f7e8f9dSMark Adams       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1801152b3e56SJunchao Zhang       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
18028f7e8f9dSMark Adams       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
18038f7e8f9dSMark Adams     }
18048f7e8f9dSMark Adams   }
18058f7e8f9dSMark Adams   PetscFunctionReturn(0);
18068f7e8f9dSMark Adams }
18078f7e8f9dSMark Adams 
180886a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1809930e68a5SMark Adams {
1810930e68a5SMark Adams   PetscFunctionBegin;
1811930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
1812930e68a5SMark Adams   PetscFunctionReturn(0);
1813930e68a5SMark Adams }
1814930e68a5SMark Adams 
18158f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
18168f7e8f9dSMark Adams {
18178f7e8f9dSMark Adams   PetscFunctionBegin;
18188f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
18198f7e8f9dSMark Adams   PetscFunctionReturn(0);
18208f7e8f9dSMark Adams }
18218f7e8f9dSMark Adams 
1822930e68a5SMark Adams /*MC
182386a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
182486a27549SJunchao Zhang   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1825930e68a5SMark Adams 
1826930e68a5SMark Adams   Level: beginner
1827930e68a5SMark Adams 
182886a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1829930e68a5SMark Adams M*/
183086a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1831930e68a5SMark Adams {
1832930e68a5SMark Adams   PetscErrorCode ierr;
1833930e68a5SMark Adams   PetscInt       n = A->rmap->n;
1834930e68a5SMark Adams 
1835930e68a5SMark Adams   PetscFunctionBegin;
1836930e68a5SMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1837930e68a5SMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1838930e68a5SMark Adams   (*B)->factortype = ftype;
18394ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1840930e68a5SMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1841930e68a5SMark Adams 
18428f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
1843930e68a5SMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
184486a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_TRUE;
184586a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
184686a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
184786a27549SJunchao Zhang     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
184886a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
184986a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
185086a27549SJunchao Zhang   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1851930e68a5SMark Adams 
1852930e68a5SMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
185386a27549SJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1854930e68a5SMark Adams   PetscFunctionReturn(0);
1855930e68a5SMark Adams }
18568f7e8f9dSMark Adams 
18578f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
18588f7e8f9dSMark Adams {
18598f7e8f9dSMark Adams   PetscErrorCode ierr;
18608f7e8f9dSMark Adams   PetscInt       n = A->rmap->n;
18618f7e8f9dSMark Adams 
18628f7e8f9dSMark Adams   PetscFunctionBegin;
18638f7e8f9dSMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
18648f7e8f9dSMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
18658f7e8f9dSMark Adams   (*B)->factortype = ftype;
1866f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
18674ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
18688f7e8f9dSMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
18698f7e8f9dSMark Adams 
18708f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
18718f7e8f9dSMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
18728f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
18738f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
18748f7e8f9dSMark Adams 
18758f7e8f9dSMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
18768f7e8f9dSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
18778f7e8f9dSMark Adams   PetscFunctionReturn(0);
18788f7e8f9dSMark Adams }
187986a27549SJunchao Zhang 
188086a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
188186a27549SJunchao Zhang {
188286a27549SJunchao Zhang   PetscErrorCode ierr;
188386a27549SJunchao Zhang 
188486a27549SJunchao Zhang   PetscFunctionBegin;
188586a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
188686a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
188786a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
188886a27549SJunchao Zhang   PetscFunctionReturn(0);
188986a27549SJunchao Zhang }
189086a27549SJunchao Zhang 
1891076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
1892076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1893076ba34aSJunchao Zhang {
1894076ba34aSJunchao Zhang   PetscErrorCode    ierr;
1895076ba34aSJunchao Zhang   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1896076ba34aSJunchao Zhang   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1897076ba34aSJunchao Zhang   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1898076ba34aSJunchao Zhang   const PetscInt    *i = iv.data();
1899076ba34aSJunchao Zhang   const PetscInt    *j = jv.data();
1900076ba34aSJunchao Zhang   const PetscScalar *a = av.data();
1901076ba34aSJunchao Zhang   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1902076ba34aSJunchao Zhang 
1903076ba34aSJunchao Zhang   PetscFunctionBegin;
1904c0aa6a63SJacob Faibussowitsch   ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1905076ba34aSJunchao Zhang   for (PetscInt k=0; k<m; k++) {
1906c0aa6a63SJacob Faibussowitsch     ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr);
1907076ba34aSJunchao Zhang     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1908c0aa6a63SJacob Faibussowitsch       ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1909076ba34aSJunchao Zhang     }
1910076ba34aSJunchao Zhang     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1911076ba34aSJunchao Zhang   }
1912076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1913076ba34aSJunchao Zhang }
1914