xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 580c7c76e62496a283478b0c6a56200bce7b613e)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2076ba34aSJunchao Zhang #include <petscpkg_version.h>
3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
442550becSJunchao Zhang #include <petsc/private/sfimpl.h>
58c3ff71bSJunchao Zhang #include <petscsystypes.h>
68c3ff71bSJunchao Zhang #include <petscerror.h>
78c3ff71bSJunchao Zhang 
88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
9f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
10076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp>
118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
128c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
15076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
16076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
1786a27549SJunchao Zhang 
188c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h>
1942550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
208c3ff71bSJunchao Zhang 
218c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
228c3ff71bSJunchao Zhang 
23076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
24076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
25076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
26076ba34aSJunchao Zhang  */
278c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
288c3ff71bSJunchao Zhang {
298c3ff71bSJunchao Zhang   PetscErrorCode    ierr;
30076ba34aSJunchao Zhang   Mat_SeqAIJ        *aijseq;
31076ba34aSJunchao Zhang   Mat_SeqAIJKokkos  *aijkok;
328c3ff71bSJunchao Zhang 
338c3ff71bSJunchao Zhang   PetscFunctionBegin;
34076ba34aSJunchao Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
358c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
36076ba34aSJunchao Zhang 
37076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
38076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
39076ba34aSJunchao Zhang 
40076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
41076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
42076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
43076ba34aSJunchao Zhang   */
44076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
45076ba34aSJunchao Zhang     delete aijkok;
46076ba34aSJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
47076ba34aSJunchao Zhang     A->spptr = aijkok;
48076ba34aSJunchao Zhang   }
49076ba34aSJunchao Zhang 
50a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
51a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
52a587d139SMark   }
538c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
548c3ff71bSJunchao Zhang }
558c3ff71bSJunchao Zhang 
5686a27549SJunchao Zhang /* Sync CSR data to device if not yet */
57076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
588c3ff71bSJunchao Zhang {
598c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
608c3ff71bSJunchao Zhang 
618c3ff71bSJunchao Zhang   PetscFunctionBegin;
622c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
632c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
642c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
65076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
66076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
67*580c7c76SPierre Jolivet     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
6886a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
698c3ff71bSJunchao Zhang   }
708c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
718c3ff71bSJunchao Zhang }
728c3ff71bSJunchao Zhang 
73076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
74fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
7586a27549SJunchao Zhang {
7686a27549SJunchao Zhang   PetscErrorCode   ierr;
7786a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
7886a27549SJunchao Zhang 
7986a27549SJunchao Zhang   PetscFunctionBegin;
802c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
8186a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
8286a27549SJunchao Zhang   aijkok->a_dual.modify_device();
8386a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
8486a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
852328674fSJunchao Zhang   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
8686a27549SJunchao Zhang   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
8786a27549SJunchao Zhang   PetscFunctionReturn(0);
8886a27549SJunchao Zhang }
8986a27549SJunchao Zhang 
90f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
91f0cf5187SStefano Zampini {
92f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
93f0cf5187SStefano Zampini 
94f0cf5187SStefano Zampini   PetscFunctionBegin;
95f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9686a27549SJunchao Zhang    /* We do not expect one needs factors on host  */
972c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->factortype != MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
982c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
99076ba34aSJunchao Zhang   aijkok->a_dual.sync_host();
100f0cf5187SStefano Zampini   PetscFunctionReturn(0);
101f0cf5187SStefano Zampini }
102f0cf5187SStefano Zampini 
103f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
104f0cf5187SStefano Zampini {
105076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
106f0cf5187SStefano Zampini 
107f0cf5187SStefano Zampini   PetscFunctionBegin;
108076ba34aSJunchao Zhang   if (aijkok) {
109076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
110076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
111076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
112076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
113076ba34aSJunchao Zhang   }
114076ba34aSJunchao Zhang   PetscFunctionReturn(0);
115076ba34aSJunchao Zhang }
116076ba34aSJunchao Zhang 
117076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
118076ba34aSJunchao Zhang {
119076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
120076ba34aSJunchao Zhang 
121076ba34aSJunchao Zhang   PetscFunctionBegin;
122076ba34aSJunchao Zhang   if (aijkok) aijkok->a_dual.modify_host();
123076ba34aSJunchao Zhang   PetscFunctionReturn(0);
124076ba34aSJunchao Zhang }
125076ba34aSJunchao Zhang 
126076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
127076ba34aSJunchao Zhang {
128076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
129076ba34aSJunchao Zhang 
130076ba34aSJunchao Zhang   PetscFunctionBegin;
1312328674fSJunchao Zhang   if (aijkok) {
132076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
133076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1342328674fSJunchao Zhang   } else {
1352328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1362328674fSJunchao Zhang   }
137076ba34aSJunchao Zhang   PetscFunctionReturn(0);
138076ba34aSJunchao Zhang }
139076ba34aSJunchao Zhang 
140076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
141076ba34aSJunchao Zhang {
142076ba34aSJunchao Zhang   PetscFunctionBegin;
143076ba34aSJunchao Zhang   *array = NULL;
144076ba34aSJunchao Zhang   PetscFunctionReturn(0);
145076ba34aSJunchao Zhang }
146076ba34aSJunchao Zhang 
147076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
148076ba34aSJunchao Zhang {
149076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
150076ba34aSJunchao Zhang 
151076ba34aSJunchao Zhang   PetscFunctionBegin;
1522328674fSJunchao Zhang   if (aijkok) {
153076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1542328674fSJunchao Zhang   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
1552328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1562328674fSJunchao Zhang   }
157076ba34aSJunchao Zhang   PetscFunctionReturn(0);
158076ba34aSJunchao Zhang }
159076ba34aSJunchao Zhang 
160076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
161076ba34aSJunchao Zhang {
162076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
163076ba34aSJunchao Zhang 
164076ba34aSJunchao Zhang   PetscFunctionBegin;
1652328674fSJunchao Zhang   if (aijkok) {
166076ba34aSJunchao Zhang     aijkok->a_dual.clear_sync_state();
167076ba34aSJunchao Zhang     aijkok->a_dual.modify_host();
1682328674fSJunchao Zhang   }
169f0cf5187SStefano Zampini   PetscFunctionReturn(0);
170f0cf5187SStefano Zampini }
171f0cf5187SStefano Zampini 
172a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
173042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
174a587d139SMark {
175042217e8SBarry Smith   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
176042217e8SBarry Smith   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
177a587d139SMark 
178a587d139SMark   PetscFunctionBegin;
1792c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
180152b3e56SJunchao Zhang   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
181a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
182a587d139SMark   PetscFunctionReturn(0);
183a587d139SMark }
184a587d139SMark 
185a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
186042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
187a587d139SMark {
188042217e8SBarry Smith   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
189a587d139SMark 
190a587d139SMark   PetscFunctionBegin;
191a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
192a587d139SMark     *d_mat = aijkok->device_mat_d.data();
193a587d139SMark   } else {
194a587d139SMark     PetscErrorCode   ierr;
195a587d139SMark     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
196a587d139SMark     *d_mat  = NULL;
197a587d139SMark   }
198a587d139SMark   PetscFunctionReturn(0);
199a587d139SMark }
200076ba34aSJunchao Zhang 
201076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */
202076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
203152b3e56SJunchao Zhang {
204152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
205152b3e56SJunchao Zhang 
206152b3e56SJunchao Zhang   PetscFunctionBegin;
2072c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
208076ba34aSJunchao Zhang   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
209076ba34aSJunchao Zhang     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
210076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
211076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
212076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
21386a27549SJunchao Zhang     aijkok->transpose_updated = PETSC_TRUE;
214076ba34aSJunchao Zhang   }
215076ba34aSJunchao Zhang   *csrmatT = &aijkok->csrmatT;
216152b3e56SJunchao Zhang   PetscFunctionReturn(0);
217152b3e56SJunchao Zhang }
218152b3e56SJunchao Zhang 
219076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */
220076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
221152b3e56SJunchao Zhang {
222eeadb341SJunchao Zhang   PetscErrorCode                   ierr;
223152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
224152b3e56SJunchao Zhang 
225152b3e56SJunchao Zhang   PetscFunctionBegin;
226eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2272c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
228076ba34aSJunchao Zhang   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
229076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
230076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
231076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
232076ba34aSJunchao Zhang    #if defined(PETSC_USE_COMPLEX)
233076ba34aSJunchao Zhang     const auto& a = aijkok->csrmatH.values;
234076ba34aSJunchao Zhang     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
235076ba34aSJunchao Zhang    #endif
23686a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_TRUE;
237076ba34aSJunchao Zhang   }
238076ba34aSJunchao Zhang   *csrmatH = &aijkok->csrmatH;
239eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
240152b3e56SJunchao Zhang   PetscFunctionReturn(0);
241152b3e56SJunchao Zhang }
242a587d139SMark 
2438c3ff71bSJunchao Zhang /* y = A x */
2448c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2458c3ff71bSJunchao Zhang {
2468c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2478c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
248152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
249152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2508c3ff71bSJunchao Zhang 
2518c3ff71bSJunchao Zhang   PetscFunctionBegin;
2528c3ff71bSJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
253152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
254152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
2558c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
256eeadb341SJunchao Zhang   ierr   = PetscLogGpuTimeBegin();CHKERRQ(ierr);
257152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
258152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
259152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
260076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
261152b3e56SJunchao Zhang   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2628c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2638c3ff71bSJunchao Zhang }
2648c3ff71bSJunchao Zhang 
2658c3ff71bSJunchao Zhang /* y = A^T x */
2668c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2678c3ff71bSJunchao Zhang {
2688c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2698c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
270152b3e56SJunchao Zhang   const char                       *mode;
271152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
272152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
273076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
2748c3ff71bSJunchao Zhang 
2758c3ff71bSJunchao Zhang   PetscFunctionBegin;
276eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2778c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
278152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
279152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
280152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
281076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
282152b3e56SJunchao Zhang     mode = "N";
283152b3e56SJunchao Zhang   } else {
284076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
285076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
286152b3e56SJunchao Zhang     mode = "T";
287152b3e56SJunchao Zhang   }
288076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
289152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
290152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
291076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
292eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2938c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2948c3ff71bSJunchao Zhang }
2958c3ff71bSJunchao Zhang 
2968c3ff71bSJunchao Zhang /* y = A^H x */
2978c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2988c3ff71bSJunchao Zhang {
2998c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3008c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
301152b3e56SJunchao Zhang   const char                       *mode;
302152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
303152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
304076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3058c3ff71bSJunchao Zhang 
3068c3ff71bSJunchao Zhang   PetscFunctionBegin;
307eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3088c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
309152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
310152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
311152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
312076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
313152b3e56SJunchao Zhang     mode = "N";
314152b3e56SJunchao Zhang   } else {
315076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
316076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
317152b3e56SJunchao Zhang     mode = "C";
318152b3e56SJunchao Zhang   }
319076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
320152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
321152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
322076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
323eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3248c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3258c3ff71bSJunchao Zhang }
3268c3ff71bSJunchao Zhang 
3278c3ff71bSJunchao Zhang /* z = A x + y */
3288c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
3298c3ff71bSJunchao Zhang {
3308c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3318c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
332152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
333152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
3348c3ff71bSJunchao Zhang 
3358c3ff71bSJunchao Zhang   PetscFunctionBegin;
336eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3378c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
338152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
339152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
340152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3418c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
3428c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
343152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
344152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
345152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
346152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
347152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
348eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3498c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3508c3ff71bSJunchao Zhang }
3518c3ff71bSJunchao Zhang 
3528c3ff71bSJunchao Zhang /* z = A^T x + y */
3538c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3548c3ff71bSJunchao Zhang {
3558c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3568c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
357152b3e56SJunchao Zhang   const char                       *mode;
358152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
359152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
360076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3618c3ff71bSJunchao Zhang 
3628c3ff71bSJunchao Zhang   PetscFunctionBegin;
363eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3648c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
365152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
366152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
367152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3688c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
369152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
370076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
371152b3e56SJunchao Zhang     mode = "N";
372152b3e56SJunchao Zhang   } else {
373076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
374076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
375152b3e56SJunchao Zhang     mode = "T";
376152b3e56SJunchao Zhang   }
377076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
378152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
379152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
380152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
381076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
382eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3838c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3848c3ff71bSJunchao Zhang }
3858c3ff71bSJunchao Zhang 
3868c3ff71bSJunchao Zhang /* z = A^H x + y */
3878c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3888c3ff71bSJunchao Zhang {
3898c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3908c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
391152b3e56SJunchao Zhang   const char                       *mode;
392152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
393152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
394076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3958c3ff71bSJunchao Zhang 
3968c3ff71bSJunchao Zhang   PetscFunctionBegin;
397eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3988c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
399152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
400152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
401152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
4028c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
403152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
404076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
405152b3e56SJunchao Zhang     mode = "N";
406152b3e56SJunchao Zhang   } else {
407076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
408076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
409152b3e56SJunchao Zhang     mode = "C";
410152b3e56SJunchao Zhang   }
411076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
412152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
413152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
414152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
415076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
416eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
417152b3e56SJunchao Zhang   PetscFunctionReturn(0);
418152b3e56SJunchao Zhang }
419152b3e56SJunchao Zhang 
420152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
421152b3e56SJunchao Zhang {
422152b3e56SJunchao Zhang   PetscErrorCode            ierr;
423152b3e56SJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
424152b3e56SJunchao Zhang 
425152b3e56SJunchao Zhang   PetscFunctionBegin;
426152b3e56SJunchao Zhang   switch (op) {
427152b3e56SJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
428152b3e56SJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
429152b3e56SJunchao Zhang       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
430152b3e56SJunchao Zhang       A->form_explicit_transpose = flg;
431152b3e56SJunchao Zhang       break;
432152b3e56SJunchao Zhang     default:
433152b3e56SJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
434152b3e56SJunchao Zhang       break;
435152b3e56SJunchao Zhang   }
4368c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4378c3ff71bSJunchao Zhang }
4388c3ff71bSJunchao Zhang 
439076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
4403d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
4418c3ff71bSJunchao Zhang {
4428c3ff71bSJunchao Zhang   PetscErrorCode   ierr;
443076ba34aSJunchao Zhang   Mat_SeqAIJ       *aseq;
4448c3ff71bSJunchao Zhang 
4458c3ff71bSJunchao Zhang   PetscFunctionBegin;
446a3f881fbSStefano Zampini   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
447076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
448076ba34aSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
4498c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
450076ba34aSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
451076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
4522c71b3e2SJacob Faibussowitsch     PetscCheckFalse(A != *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
453076ba34aSJunchao Zhang     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
454076ba34aSJunchao Zhang     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
455076ba34aSJunchao Zhang     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
456076ba34aSJunchao Zhang     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
457076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ*>(A->data);
458076ba34aSJunchao Zhang     if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */
4592c71b3e2SJacob Faibussowitsch       PetscCheckFalse(A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
460076ba34aSJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
4618c3ff71bSJunchao Zhang     }
462076ba34aSJunchao Zhang   }
4638c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4648c3ff71bSJunchao Zhang }
4658c3ff71bSJunchao Zhang 
466076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
467076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
468076ba34aSJunchao Zhang  */
469076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
4708c3ff71bSJunchao Zhang {
4718c3ff71bSJunchao Zhang   PetscErrorCode        ierr;
472076ba34aSJunchao Zhang   Mat_SeqAIJ            *bseq;
473076ba34aSJunchao Zhang   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
474076ba34aSJunchao Zhang   Mat                   mat;
4758c3ff71bSJunchao Zhang 
4768c3ff71bSJunchao Zhang   PetscFunctionBegin;
477076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
478076ba34aSJunchao Zhang   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
479076ba34aSJunchao Zhang   mat  = *B;
480076ba34aSJunchao Zhang   if (A->assembled) {
481076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
482076ba34aSJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
483076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
484076ba34aSJunchao Zhang     /* Now copy values to B if needed */
485076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
486076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
487076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
488076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
489076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
490076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
491076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
492076ba34aSJunchao Zhang       }
493076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
494076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
495076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
496076ba34aSJunchao Zhang     }
497076ba34aSJunchao Zhang     mat->spptr = bkok;
498076ba34aSJunchao Zhang   }
499076ba34aSJunchao Zhang 
500076ba34aSJunchao Zhang   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
501076ba34aSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
502076ba34aSJunchao Zhang   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
503076ba34aSJunchao Zhang   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
5048c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5058c3ff71bSJunchao Zhang }
5068c3ff71bSJunchao Zhang 
5070ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
5080ecb592aSJunchao Zhang {
5090ecb592aSJunchao Zhang   PetscErrorCode    ierr;
5100ecb592aSJunchao Zhang   Mat               At;
511ff751488SJunchao Zhang   KokkosCsrMatrix   *internT;
5120ecb592aSJunchao Zhang   Mat_SeqAIJKokkos  *atkok,*bkok;
5130ecb592aSJunchao Zhang 
5140ecb592aSJunchao Zhang   PetscFunctionBegin;
5150ecb592aSJunchao Zhang   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
5160ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
517ff751488SJunchao Zhang     /* Deep copy internT, as we want to isolate the internal transpose */
518ff751488SJunchao Zhang     CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
5190ecb592aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
5200ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
5217a3b1c56SStefano Zampini     else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
5220ecb592aSJunchao Zhang   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
5230ecb592aSJunchao Zhang     if ((*B)->assembled) {
5240ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
5250ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
5260ecb592aSJunchao Zhang       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
5270ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
5280ecb592aSJunchao Zhang       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
5290ecb592aSJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
5300ecb592aSJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
5310ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
5320ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
5330ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
5340ecb592aSJunchao Zhang   }
5350ecb592aSJunchao Zhang   PetscFunctionReturn(0);
5360ecb592aSJunchao Zhang }
5370ecb592aSJunchao Zhang 
5388c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
5398c3ff71bSJunchao Zhang {
5408c3ff71bSJunchao Zhang   PetscErrorCode             ierr;
54186a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
5428c3ff71bSJunchao Zhang 
5438c3ff71bSJunchao Zhang   PetscFunctionBegin;
54486a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
54586a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
5468c3ff71bSJunchao Zhang     delete aijkok;
54786a27549SJunchao Zhang   } else {
54886a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
54986a27549SJunchao Zhang   }
550152b3e56SJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
55142550becSJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
55242550becSJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       NULL);CHKERRQ(ierr);
553152b3e56SJunchao Zhang   A->spptr = NULL;
5548c3ff71bSJunchao Zhang   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
5558c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5568c3ff71bSJunchao Zhang }
5578c3ff71bSJunchao Zhang 
55886a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
55986a27549SJunchao Zhang {
56086a27549SJunchao Zhang   PetscErrorCode ierr;
56186a27549SJunchao Zhang 
56286a27549SJunchao Zhang   PetscFunctionBegin;
56386a27549SJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
56486a27549SJunchao Zhang   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
56586a27549SJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
56686a27549SJunchao Zhang   PetscFunctionReturn(0);
56786a27549SJunchao Zhang }
56886a27549SJunchao Zhang 
569076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
570076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
571a3f881fbSStefano Zampini {
572076ba34aSJunchao Zhang   PetscErrorCode               ierr;
573076ba34aSJunchao Zhang   Mat_SeqAIJ                   *a,*b;
574076ba34aSJunchao Zhang   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
575076ba34aSJunchao Zhang   MatScalarKokkosView          aa,ba,ca;
576076ba34aSJunchao Zhang   MatRowMapKokkosView          ai,bi,ci;
577076ba34aSJunchao Zhang   MatColIdxKokkosView          aj,bj,cj;
578076ba34aSJunchao Zhang   PetscInt                     m,n,nnz,aN;
579a3f881fbSStefano Zampini 
580a3f881fbSStefano Zampini   PetscFunctionBegin;
581076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
582076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
583076ba34aSJunchao Zhang   PetscValidPointer(C,4);
584076ba34aSJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
585076ba34aSJunchao Zhang   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
5862c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
5872c71b3e2SJacob Faibussowitsch   PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
588076ba34aSJunchao Zhang 
589076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
590076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
591076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ*>(A->data);
592076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ*>(B->data);
593076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
594076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
595076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
596076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
597076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
598076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
599076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
600076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
601076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
602076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
603076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
604076ba34aSJunchao Zhang     aj = akok->j_dual.view_device();
605076ba34aSJunchao Zhang     bj = bkok->j_dual.view_device();
606076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
607076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
608076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
609076ba34aSJunchao Zhang     ca = ca_dual.view_device();
610076ba34aSJunchao Zhang     ci = ci_dual.view_device();
611076ba34aSJunchao Zhang     cj = cj_dual.view_device();
612076ba34aSJunchao Zhang 
613076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
614076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
615076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
616076ba34aSJunchao Zhang       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
617076ba34aSJunchao Zhang 
618076ba34aSJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
619076ba34aSJunchao Zhang         ci(i) = coffset;
620076ba34aSJunchao Zhang         if (i == m-1) ci(m) = ai(m) + bi(m);
621076ba34aSJunchao Zhang       });
622076ba34aSJunchao Zhang 
623076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
624076ba34aSJunchao Zhang         if (k < alen) {
625076ba34aSJunchao Zhang           ca(coffset+k) = aa(ai(i)+k);
626076ba34aSJunchao Zhang           cj(coffset+k) = aj(ai(i)+k);
627076ba34aSJunchao Zhang         } else {
628076ba34aSJunchao Zhang           ca(coffset+k) = ba(bi(i)+k-alen);
629076ba34aSJunchao Zhang           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
630076ba34aSJunchao Zhang         }
631076ba34aSJunchao Zhang       });
632076ba34aSJunchao Zhang     });
633076ba34aSJunchao Zhang     ca_dual.modify_device();
634076ba34aSJunchao Zhang     ci_dual.modify_device();
635076ba34aSJunchao Zhang     cj_dual.modify_device();
636076ba34aSJunchao Zhang     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
637076ba34aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
638076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
639076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
640076ba34aSJunchao Zhang     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
641076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
642076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
643076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
644076ba34aSJunchao Zhang 
645076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
646076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
647076ba34aSJunchao Zhang       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
648076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
649076ba34aSJunchao Zhang         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
650076ba34aSJunchao Zhang         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
651076ba34aSJunchao Zhang       });
652076ba34aSJunchao Zhang     });
653076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
654076ba34aSJunchao Zhang   }
655076ba34aSJunchao Zhang   PetscFunctionReturn(0);
656076ba34aSJunchao Zhang }
657076ba34aSJunchao Zhang 
658076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
659076ba34aSJunchao Zhang {
660076ba34aSJunchao Zhang   PetscFunctionBegin;
661076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
662a3f881fbSStefano Zampini   PetscFunctionReturn(0);
663a3f881fbSStefano Zampini }
664a3f881fbSStefano Zampini 
665a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
666a3f881fbSStefano Zampini {
667076ba34aSJunchao Zhang   PetscErrorCode                 ierr;
668a3f881fbSStefano Zampini   Mat_Product                    *product = C->product;
669a3f881fbSStefano Zampini   Mat                            A,B;
670076ba34aSJunchao Zhang   bool                           transA,transB; /* use bool, since KK needs this type */
671a3f881fbSStefano Zampini   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
672a3f881fbSStefano Zampini   Mat_SeqAIJ                     *c;
673076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
674076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB;
675a3f881fbSStefano Zampini 
676a3f881fbSStefano Zampini   PetscFunctionBegin;
677a3f881fbSStefano Zampini   MatCheckProduct(C,1);
6782c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
679076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
680076ba34aSJunchao Zhang 
681076ba34aSJunchao Zhang   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
682076ba34aSJunchao Zhang     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
683076ba34aSJunchao Zhang     PetscFunctionReturn(0);
684076ba34aSJunchao Zhang   }
685076ba34aSJunchao Zhang 
686076ba34aSJunchao Zhang   switch (product->type) {
687076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
688076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
689076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
690076ba34aSJunchao Zhang     default:
69198921bdaSJacob Faibussowitsch       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
692076ba34aSJunchao Zhang   }
693076ba34aSJunchao Zhang 
694a3f881fbSStefano Zampini   A     = product->A;
695a3f881fbSStefano Zampini   B     = product->B;
696a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
697a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
698a3f881fbSStefano Zampini   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
699a3f881fbSStefano Zampini   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
700a3f881fbSStefano Zampini   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
701076ba34aSJunchao Zhang 
7022c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
703076ba34aSJunchao Zhang 
704076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
705076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
706076ba34aSJunchao Zhang 
707076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
708076ba34aSJunchao Zhang   if (transA) {
709076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
710076ba34aSJunchao Zhang     transA = false;
711a3f881fbSStefano Zampini   }
712a3f881fbSStefano Zampini 
713076ba34aSJunchao Zhang   if (transB) {
714076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
715076ba34aSJunchao Zhang     transB = false;
716076ba34aSJunchao Zhang   }
717eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
718076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
719076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
720eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
721076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
722a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
723a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
7247d3de750SJacob Faibussowitsch   ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
725a3f881fbSStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
7267d3de750SJacob Faibussowitsch   ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
727a3f881fbSStefano Zampini   c->reallocs         = 0;
728076ba34aSJunchao Zhang   C->info.mallocs     = 0;
729a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
730a3f881fbSStefano Zampini   C->assembled        = C->was_assembled = PETSC_TRUE;
731a3f881fbSStefano Zampini   C->num_ass++;
732a3f881fbSStefano Zampini   PetscFunctionReturn(0);
733a3f881fbSStefano Zampini }
734a3f881fbSStefano Zampini 
735a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
736a3f881fbSStefano Zampini {
737a3f881fbSStefano Zampini   PetscErrorCode                 ierr;
738076ba34aSJunchao Zhang   Mat_Product                    *product = C->product;
739076ba34aSJunchao Zhang   MatProductType                 ptype;
740076ba34aSJunchao Zhang   Mat                            A,B;
741076ba34aSJunchao Zhang   bool                           transA,transB;
742076ba34aSJunchao Zhang   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
743076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
744076ba34aSJunchao Zhang   MPI_Comm                       comm;
745076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
746a3f881fbSStefano Zampini 
747a3f881fbSStefano Zampini   PetscFunctionBegin;
748a3f881fbSStefano Zampini   MatCheckProduct(C,1);
749076ba34aSJunchao Zhang   ierr = PetscObjectGetComm((PetscObject)C,&comm);
7502c71b3e2SJacob Faibussowitsch   PetscCheckFalse(product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
751a3f881fbSStefano Zampini   A       = product->A;
752a3f881fbSStefano Zampini   B       = product->B;
753a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
754a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
755a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
756a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
757076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
758076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
759076ba34aSJunchao Zhang 
760a3f881fbSStefano Zampini   ptype   = product->type;
761a3f881fbSStefano Zampini   switch (ptype) {
762076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
763076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
764076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
765a3f881fbSStefano Zampini     default:
76698921bdaSJacob Faibussowitsch       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
767a3f881fbSStefano Zampini   }
768a3f881fbSStefano Zampini 
769076ba34aSJunchao Zhang   product->data = pdata = new MatProductData_SeqAIJKokkos();
770076ba34aSJunchao Zhang   pdata->kh.set_team_work_size(16);
771076ba34aSJunchao Zhang   pdata->kh.set_dynamic_scheduling(true);
772076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
773a3f881fbSStefano Zampini 
774076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
775076ba34aSJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
7766ffb9508SJunchao Zhang   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
7776ffb9508SJunchao Zhang      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
7786ffb9508SJunchao Zhang    */
779076ba34aSJunchao Zhang   pdata->kh.create_spgemm_handle(spgemm_alg);
780076ba34aSJunchao Zhang 
781eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
782076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
783076ba34aSJunchao Zhang   if (transA) {
784076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
785076ba34aSJunchao Zhang     transA = false;
786076ba34aSJunchao Zhang   }
787076ba34aSJunchao Zhang 
788076ba34aSJunchao Zhang   if (transB) {
789076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
790076ba34aSJunchao Zhang     transB = false;
791076ba34aSJunchao Zhang   }
792076ba34aSJunchao Zhang 
793076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
794076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
795076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
796076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
797076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
798076ba34aSJunchao Zhang   */
799076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
800076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
801eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
802076ba34aSJunchao Zhang 
803076ba34aSJunchao Zhang   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
804076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
805076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
806a3f881fbSStefano Zampini   PetscFunctionReturn(0);
807a3f881fbSStefano Zampini }
808a3f881fbSStefano Zampini 
809a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
810076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
811a3f881fbSStefano Zampini {
812a3f881fbSStefano Zampini   PetscErrorCode ierr;
813076ba34aSJunchao Zhang   Mat_Product    *product = mat->product;
814a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
815a3f881fbSStefano Zampini 
816a3f881fbSStefano Zampini   PetscFunctionBegin;
817a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
818a3f881fbSStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
819a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
820a3f881fbSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
821a3f881fbSStefano Zampini   }
822a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
823a3f881fbSStefano Zampini     switch (product->type) {
824a3f881fbSStefano Zampini     case MATPRODUCT_AB:
825a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
826a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
827a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
828a3f881fbSStefano Zampini       break;
829a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
830a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
831a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
832a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
833a3f881fbSStefano Zampini       break;
834a3f881fbSStefano Zampini     default:
835a3f881fbSStefano Zampini       break;
836a3f881fbSStefano Zampini     }
837a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
838a3f881fbSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
839a3f881fbSStefano Zampini   }
840a3f881fbSStefano Zampini   PetscFunctionReturn(0);
841a3f881fbSStefano Zampini }
842a587d139SMark 
843f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
844f0cf5187SStefano Zampini {
845f0cf5187SStefano Zampini   PetscErrorCode   ierr;
846f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
847f0cf5187SStefano Zampini 
848f0cf5187SStefano Zampini   PetscFunctionBegin;
849eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
850f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
851f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
852076ba34aSJunchao Zhang   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
853076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
854076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
855eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
856f0cf5187SStefano Zampini   PetscFunctionReturn(0);
857f0cf5187SStefano Zampini }
858f0cf5187SStefano Zampini 
859a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
860a587d139SMark {
861a587d139SMark   PetscErrorCode   ierr;
862076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
863a587d139SMark 
864a587d139SMark   PetscFunctionBegin;
865076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
8662328674fSJunchao Zhang   if (aijkok) { /* Only zero the device if data is already there */
867076ba34aSJunchao Zhang     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
868076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
8692328674fSJunchao Zhang   } else { /* Might be preallocated but not assembled */
8702328674fSJunchao Zhang     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
8712328674fSJunchao Zhang   }
872a587d139SMark   PetscFunctionReturn(0);
873a587d139SMark }
874a587d139SMark 
875db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
87642550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
877db78de30SJunchao Zhang {
878db78de30SJunchao Zhang   PetscErrorCode     ierr;
879db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
880db78de30SJunchao Zhang 
881db78de30SJunchao Zhang   PetscFunctionBegin;
882db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
883db78de30SJunchao Zhang   PetscValidPointer(kv,2);
884db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
885db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
886db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
887076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
888db78de30SJunchao Zhang   PetscFunctionReturn(0);
889db78de30SJunchao Zhang }
890db78de30SJunchao Zhang 
89142550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
892db78de30SJunchao Zhang {
893db78de30SJunchao Zhang   PetscFunctionBegin;
894db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
895db78de30SJunchao Zhang   PetscValidPointer(kv,2);
896db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
897db78de30SJunchao Zhang   PetscFunctionReturn(0);
898db78de30SJunchao Zhang }
899db78de30SJunchao Zhang 
90042550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
901db78de30SJunchao Zhang {
902db78de30SJunchao Zhang   PetscErrorCode     ierr;
903db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
904db78de30SJunchao Zhang 
905db78de30SJunchao Zhang   PetscFunctionBegin;
906db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
907db78de30SJunchao Zhang   PetscValidPointer(kv,2);
908db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
909db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
910db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
911076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
912db78de30SJunchao Zhang   PetscFunctionReturn(0);
913db78de30SJunchao Zhang }
914db78de30SJunchao Zhang 
91542550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
916db78de30SJunchao Zhang {
917db78de30SJunchao Zhang   PetscErrorCode     ierr;
918db78de30SJunchao Zhang 
919db78de30SJunchao Zhang   PetscFunctionBegin;
920db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
921db78de30SJunchao Zhang   PetscValidPointer(kv,2);
922db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
923076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
924db78de30SJunchao Zhang   PetscFunctionReturn(0);
925db78de30SJunchao Zhang }
926db78de30SJunchao Zhang 
92742550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
928db78de30SJunchao Zhang {
929db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
930db78de30SJunchao Zhang 
931db78de30SJunchao Zhang   PetscFunctionBegin;
932db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
933db78de30SJunchao Zhang   PetscValidPointer(kv,2);
934db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
935db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
936076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
937db78de30SJunchao Zhang   PetscFunctionReturn(0);
938db78de30SJunchao Zhang }
939db78de30SJunchao Zhang 
94042550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
941db78de30SJunchao Zhang {
942db78de30SJunchao Zhang   PetscErrorCode     ierr;
943db78de30SJunchao Zhang 
944db78de30SJunchao Zhang   PetscFunctionBegin;
945db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
946db78de30SJunchao Zhang   PetscValidPointer(kv,2);
947db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
948076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
949db78de30SJunchao Zhang   PetscFunctionReturn(0);
950db78de30SJunchao Zhang }
951db78de30SJunchao Zhang 
952c17cf699SJunchao Zhang /* Computes Y += alpha X */
953c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
954a587d139SMark {
955a587d139SMark   PetscErrorCode             ierr;
956a587d139SMark   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
957c17cf699SJunchao Zhang   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
958c17cf699SJunchao Zhang   ConstMatScalarKokkosView   Xa;
959c17cf699SJunchao Zhang   MatScalarKokkosView        Ya;
960a587d139SMark 
961a587d139SMark   PetscFunctionBegin;
962c17cf699SJunchao Zhang   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
963c17cf699SJunchao Zhang   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
964c17cf699SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
965c17cf699SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
966eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
967db78de30SJunchao Zhang 
968c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
969c17cf699SJunchao Zhang     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
970a587d139SMark     PetscBool e;
971a587d139SMark     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
972a587d139SMark     if (e) {
973a587d139SMark       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
974c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
975a587d139SMark     }
976a587d139SMark   }
977db78de30SJunchao Zhang 
978c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
979c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
980c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
981c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
982c17cf699SJunchao Zhang   */
983c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
984c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
985c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
986c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
987c17cf699SJunchao Zhang 
988c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
989c17cf699SJunchao Zhang     KokkosBlas::axpy(alpha,Xa,Ya);
990c17cf699SJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(Y);
991c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
992c17cf699SJunchao Zhang     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
993c17cf699SJunchao Zhang     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
994c17cf699SJunchao Zhang 
995c17cf699SJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
996c17cf699SJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
997c17cf699SJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
998c17cf699SJunchao Zhang         PetscInt p,q = Yi(i);
999c17cf699SJunchao Zhang         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1000c17cf699SJunchao Zhang           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1001c17cf699SJunchao Zhang           if (Xj(p) == Yj(q)) { /* Found it */
1002c17cf699SJunchao Zhang             Ya(q) += alpha * Xa(p);
1003c17cf699SJunchao Zhang             q++;
1004a587d139SMark           } else {
1005c17cf699SJunchao Zhang             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1006c17cf699SJunchao Zhang                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1007c17cf699SJunchao Zhang             */
1008c17cf699SJunchao Zhang             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1009a587d139SMark           }
1010c17cf699SJunchao Zhang         }
1011c17cf699SJunchao Zhang       });
1012c17cf699SJunchao Zhang     });
1013c17cf699SJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(Y);
1014c17cf699SJunchao Zhang   } else { /* different nonzero patterns */
1015c17cf699SJunchao Zhang     Mat             Z;
1016c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
1017c17cf699SJunchao Zhang     KernelHandle    kh;
1018c17cf699SJunchao Zhang     kh.create_spadd_handle(false);
1019c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1020c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1021c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
1022c17cf699SJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1023c17cf699SJunchao Zhang     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1024c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1025c17cf699SJunchao Zhang   }
1026eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1027eeadb341SJunchao Zhang   ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */
1028a587d139SMark   PetscFunctionReturn(0);
1029a587d139SMark }
1030a587d139SMark 
103142550becSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
103242550becSJunchao Zhang {
103342550becSJunchao Zhang   PetscErrorCode            ierr;
103442550becSJunchao Zhang   MPI_Comm                  comm;
103542550becSJunchao Zhang   PetscInt                  *i,*j,*perm,*jmap;
103642550becSJunchao Zhang   PetscInt                  k,M,N,p,q,row,start,end,nnz,nneg;
103742550becSJunchao Zhang   PetscBool                 has_repeats = PETSC_FALSE;
103842550becSJunchao Zhang   PetscInt                  *Ai,*Aj;
103942550becSJunchao Zhang   PetscScalar               *Aa;
104042550becSJunchao Zhang   Mat_SeqAIJKokkos          *akok;
104142550becSJunchao Zhang   Mat_SeqAIJ                *aseq;
104242550becSJunchao Zhang   MatRowMapKokkosViewHost   perm_h("perm",n),jmap_h;
104342550becSJunchao Zhang   Mat                       newmat;
104442550becSJunchao Zhang 
104542550becSJunchao Zhang   PetscFunctionBegin;
104642550becSJunchao Zhang   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
104742550becSJunchao Zhang   ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr);
104842550becSJunchao Zhang   ierr = PetscMalloc2(n,&i,n,&j);CHKERRQ(ierr);
104942550becSJunchao Zhang   ierr = PetscArraycpy(i,coo_i,n);CHKERRQ(ierr); /* Make a copy since we'll modify it */
105042550becSJunchao Zhang   ierr = PetscArraycpy(j,coo_j,n);CHKERRQ(ierr);
105142550becSJunchao Zhang   perm = perm_h.data();
105242550becSJunchao Zhang   for (k=0; k<n; k++) { /* Ignore entries with negative row or col indices */
105342550becSJunchao Zhang     if (j[k] < 0) i[k] = -1;
105442550becSJunchao Zhang     perm[k] = k;
105542550becSJunchao Zhang   }
105642550becSJunchao Zhang 
105742550becSJunchao Zhang   /* Sort by row */
105842550becSJunchao Zhang   ierr = PetscSortIntWithArrayPair(n,i,j,perm);CHKERRQ(ierr);
105942550becSJunchao Zhang   for (k=0; k<n; k++) {if (i[k] >= 0) break;} /* Advance k to the first row with a non-negative index */
106042550becSJunchao Zhang   nneg = k;
106142550becSJunchao Zhang   Kokkos::resize(jmap_h,n-nneg+1); /* Allocate an extra to make a CSR-like data structure. jmap[i] is the number of repeats for i-th nonzero */
106242550becSJunchao Zhang   nnz  = 0; /* Total number of unique nonzeros to be counted */
106342550becSJunchao Zhang   jmap = jmap_h.data(); /* In the end, only the first nnz+1 elements in jmap[] are significant. */
106442550becSJunchao Zhang   jmap++; /* Inc jmap by 1 for convinience */
106542550becSJunchao Zhang 
106642550becSJunchao Zhang   ierr = PetscCalloc1(M+1,&Ai);CHKERRQ(ierr); /* CSR of A */
106742550becSJunchao Zhang   ierr = PetscMalloc1(n-nneg,&Aj);CHKERRQ(ierr); /* We have at most n-k unique nonzeros */
106842550becSJunchao Zhang 
106942550becSJunchao Zhang   /* In each row, sort by column, then unique column indices to get row length */
107042550becSJunchao Zhang   Ai++; /* Inc by 1 for convinience */
107142550becSJunchao Zhang   q = 0; /* q-th unique nonzero, with q starting from 0 */
107242550becSJunchao Zhang   while (k<n) {
107342550becSJunchao Zhang     row   = i[k];
107442550becSJunchao Zhang     start = k; /* [start,end) indices for this row */
107542550becSJunchao Zhang     while (k<n && i[k] == row) k++;
107642550becSJunchao Zhang     end   = k;
107742550becSJunchao Zhang     ierr  = PetscSortIntWithArray(end-start,j+start,perm+start);CHKERRQ(ierr);
107842550becSJunchao Zhang     /* Find number of unique col entries in this row */
107942550becSJunchao Zhang     Aj[q]   = j[start]; /* Log the first nonzero in this row */
108042550becSJunchao Zhang     jmap[q] = 1; /* Number of repeats of this nozero entry */
108142550becSJunchao Zhang     Ai[row] = 1;
108242550becSJunchao Zhang     nnz++;
108342550becSJunchao Zhang 
108442550becSJunchao Zhang     for (p=start+1; p<end; p++) { /* Scan remaining nonzero in this row */
108542550becSJunchao Zhang       if (j[p] != j[p-1]) { /* Meet a new nonzero */
108642550becSJunchao Zhang         q++;
108742550becSJunchao Zhang         jmap[q] = 1;
108842550becSJunchao Zhang         Aj[q]   = j[p];
108942550becSJunchao Zhang         Ai[row]++;
109042550becSJunchao Zhang         nnz++;
109142550becSJunchao Zhang       } else {
109242550becSJunchao Zhang         jmap[q]++;
109342550becSJunchao Zhang         has_repeats = PETSC_TRUE;
109442550becSJunchao Zhang       }
109542550becSJunchao Zhang     }
109642550becSJunchao Zhang     q++; /* Move to next row and thus next unique nonzero */
109742550becSJunchao Zhang   }
109842550becSJunchao Zhang   ierr = PetscFree2(i,j);CHKERRQ(ierr);
109942550becSJunchao Zhang 
110042550becSJunchao Zhang   Ai--; /* Back to the beginning of Ai[] */
110142550becSJunchao Zhang   for (k=0; k<M; k++) Ai[k+1] += Ai[k];
110242550becSJunchao Zhang   jmap--; /* Back to the beginning of jmap[] */
110342550becSJunchao Zhang   jmap[0] = 0;
110442550becSJunchao Zhang   if (has_repeats) { /* Only transform jmap[] to CSR when having repeats, otherwise jmap[] is not used */
110542550becSJunchao Zhang     for (k=0; k<nnz; k++) jmap[k+1] += jmap[k];
110642550becSJunchao Zhang     Kokkos::resize(jmap_h,nnz+1); /* Resize wrt the actual number of nonzeros */
110742550becSJunchao Zhang   }
110842550becSJunchao Zhang 
110942550becSJunchao Zhang   if (nnz < n-nneg) { /* Realloc Aj[] to actual number of nonzeros */
111042550becSJunchao Zhang     PetscInt *Aj_new;
111142550becSJunchao Zhang     ierr = PetscMalloc1(nnz,&Aj_new);CHKERRQ(ierr);
111242550becSJunchao Zhang     ierr = PetscArraycpy(Aj_new,Aj,nnz);CHKERRQ(ierr);
111342550becSJunchao Zhang     ierr = PetscFree(Aj);CHKERRQ(ierr);
111442550becSJunchao Zhang     Aj = Aj_new;
111542550becSJunchao Zhang   }
111642550becSJunchao Zhang 
111742550becSJunchao Zhang   ierr = PetscMalloc1(nnz,&Aa);CHKERRQ(ierr);
111842550becSJunchao Zhang   ierr = MatCreateSeqAIJWithArrays(comm,M,N,Ai,Aj,Aa,&newmat);CHKERRQ(ierr);
111942550becSJunchao Zhang   aseq = static_cast<Mat_SeqAIJ*>(newmat->data);
112042550becSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE; /* Let newmat own Ai,Aj,Aa */
112142550becSJunchao Zhang   aseq->free_a       = aseq->free_ij = PETSC_TRUE;
112242550becSJunchao Zhang 
112342550becSJunchao Zhang   ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
112442550becSJunchao Zhang   ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
112542550becSJunchao Zhang   ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
112642550becSJunchao Zhang 
112742550becSJunchao Zhang   if (nneg) { /* Discard heading entries with negative indices in perm_h, as we'll access it from index 0 in MatSetValuesCOO */
112842550becSJunchao Zhang     MatRowMapKokkosViewHost   newperm_h("perm",n-nneg);
112942550becSJunchao Zhang     Kokkos::deep_copy(newperm_h,Kokkos::subview(perm_h,Kokkos::make_pair(nneg,n)));
113042550becSJunchao Zhang     perm_h = newperm_h;
113142550becSJunchao Zhang   }
113242550becSJunchao Zhang 
113342550becSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
113442550becSJunchao Zhang   akok->SetUpCOO(n,has_repeats,jmap_h,perm_h);
113542550becSJunchao Zhang   PetscFunctionReturn(0);
113642550becSJunchao Zhang }
113742550becSJunchao Zhang 
113842550becSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
113942550becSJunchao Zhang {
114042550becSJunchao Zhang   PetscErrorCode              ierr;
114142550becSJunchao Zhang   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
114242550becSJunchao Zhang   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
114342550becSJunchao Zhang   PetscInt                    nz = aseq->nz;
114442550becSJunchao Zhang   const MatRowMapKokkosView&  jmap = akok->jmap_d;
114542550becSJunchao Zhang   const MatRowMapKokkosView&  perm = akok->perm_d;
114642550becSJunchao Zhang   MatScalarKokkosView         Aa;
114742550becSJunchao Zhang   ConstMatScalarKokkosView    kv;
114842550becSJunchao Zhang   PetscMemType                memtype;
114942550becSJunchao Zhang 
115042550becSJunchao Zhang   PetscFunctionBegin;
115142550becSJunchao Zhang   if (!v) { /* NULL v means an all zero array */
115242550becSJunchao Zhang     ierr = MatZeroEntries(A);CHKERRQ(ierr);
115342550becSJunchao Zhang     PetscFunctionReturn(0);
115442550becSJunchao Zhang   }
115542550becSJunchao Zhang 
115642550becSJunchao Zhang   ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
115742550becSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
115842550becSJunchao Zhang     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,akok->coo_n));
115942550becSJunchao Zhang   } else {
116042550becSJunchao Zhang     kv = ConstMatScalarKokkosView(v,akok->coo_n); /* Directly use v[]'s memory */
116142550becSJunchao Zhang   }
116242550becSJunchao Zhang 
116342550becSJunchao Zhang   ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr); /* Might read and write matrix values */
116442550becSJunchao Zhang   if (imode == INSERT_VALUES) {
116542550becSJunchao Zhang     Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
116642550becSJunchao Zhang   }
116742550becSJunchao Zhang 
116842550becSJunchao Zhang   if (akok->coo_has_repeats) {
116942550becSJunchao Zhang     Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) {
117042550becSJunchao Zhang       for (PetscInt k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));
117142550becSJunchao Zhang     });
117242550becSJunchao Zhang   } else {
117342550becSJunchao Zhang     Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) {Aa(i) += kv(perm(i));});
117442550becSJunchao Zhang   }
117542550becSJunchao Zhang   ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);
117642550becSJunchao Zhang   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
117742550becSJunchao Zhang   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
117842550becSJunchao Zhang   PetscFunctionReturn(0);
117942550becSJunchao Zhang }
118042550becSJunchao Zhang 
118186a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
11828f7e8f9dSMark Adams {
11838f7e8f9dSMark Adams   PetscErrorCode ierr;
11848f7e8f9dSMark Adams 
11858f7e8f9dSMark Adams   PetscFunctionBegin;
11868f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
11878f7e8f9dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
11888f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
11898f7e8f9dSMark Adams   PetscFunctionReturn(0);
11908f7e8f9dSMark Adams }
11918f7e8f9dSMark Adams 
11928c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
11938c3ff71bSJunchao Zhang {
119442550becSJunchao Zhang   PetscErrorCode     ierr;
1195076ba34aSJunchao Zhang   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1196076ba34aSJunchao Zhang 
11978c3ff71bSJunchao Zhang   PetscFunctionBegin;
1198076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
11996f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
12006f3d89d0SStefano Zampini 
12018c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
12028c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
12038c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1204a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1205f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1206a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1207076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
12088c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
12098c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
12108c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
12118c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
12128c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
12138c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1214076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
12150ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1216152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1217076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1218076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1219076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1220076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1221076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1222076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
122342550becSJunchao Zhang 
122442550becSJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr);
122542550becSJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr);
1226076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1227076ba34aSJunchao Zhang }
1228076ba34aSJunchao Zhang 
1229076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1230076ba34aSJunchao Zhang {
1231076ba34aSJunchao Zhang   PetscErrorCode ierr;
1232076ba34aSJunchao Zhang   Mat_SeqAIJ     *aseq;
1233076ba34aSJunchao Zhang   PetscInt       i,m,n;
1234076ba34aSJunchao Zhang 
1235076ba34aSJunchao Zhang   PetscFunctionBegin;
12362c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1237076ba34aSJunchao Zhang 
1238076ba34aSJunchao Zhang   m    = akok->nrows();
1239076ba34aSJunchao Zhang   n    = akok->ncols();
1240076ba34aSJunchao Zhang   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1241076ba34aSJunchao Zhang   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1242076ba34aSJunchao Zhang 
1243076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
1244076ba34aSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1245076ba34aSJunchao Zhang   aseq = (Mat_SeqAIJ*)(A)->data;
1246076ba34aSJunchao Zhang 
1247076ba34aSJunchao Zhang   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1248076ba34aSJunchao Zhang   akok->j_dual.sync_host();
1249076ba34aSJunchao Zhang 
1250076ba34aSJunchao Zhang   aseq->i            = akok->i_host_data();
1251076ba34aSJunchao Zhang   aseq->j            = akok->j_host_data();
1252076ba34aSJunchao Zhang   aseq->a            = akok->a_host_data();
1253076ba34aSJunchao Zhang   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1254076ba34aSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE;
1255076ba34aSJunchao Zhang   aseq->free_a       = PETSC_FALSE;
1256076ba34aSJunchao Zhang   aseq->free_ij      = PETSC_FALSE;
1257076ba34aSJunchao Zhang   aseq->nz           = akok->nnz();
1258076ba34aSJunchao Zhang   aseq->maxnz        = aseq->nz;
1259076ba34aSJunchao Zhang 
1260076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1261076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1262076ba34aSJunchao Zhang   for (i=0; i<m; i++) {
1263076ba34aSJunchao Zhang     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1264076ba34aSJunchao Zhang   }
1265076ba34aSJunchao Zhang 
1266076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1267076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1268ff751488SJunchao Zhang   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1269076ba34aSJunchao Zhang   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1270076ba34aSJunchao Zhang   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1271076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1272076ba34aSJunchao Zhang }
1273076ba34aSJunchao Zhang 
1274076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1275076ba34aSJunchao Zhang 
1276076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1277076ba34aSJunchao Zhang  */
1278076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1279076ba34aSJunchao Zhang {
1280076ba34aSJunchao Zhang   PetscErrorCode ierr;
1281076ba34aSJunchao Zhang 
1282076ba34aSJunchao Zhang   PetscFunctionBegin;
1283076ba34aSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1284076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
12858c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
12868c3ff71bSJunchao Zhang }
12878c3ff71bSJunchao Zhang 
12888c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
1289152b3e56SJunchao Zhang /*@C
12908c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
12918c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
12928c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
12938c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
12948c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
12958c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
12968c3ff71bSJunchao Zhang 
12978c3ff71bSJunchao Zhang    Collective
12988c3ff71bSJunchao Zhang 
12998c3ff71bSJunchao Zhang    Input Parameters:
13008c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
13018c3ff71bSJunchao Zhang .  m - number of rows
13028c3ff71bSJunchao Zhang .  n - number of columns
13038c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
13048c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
13058c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
13068c3ff71bSJunchao Zhang 
13078c3ff71bSJunchao Zhang    Output Parameter:
13088c3ff71bSJunchao Zhang .  A - the matrix
13098c3ff71bSJunchao Zhang 
13108c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
13118c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
13128c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
13138c3ff71bSJunchao Zhang 
13148c3ff71bSJunchao Zhang    Notes:
13158c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
13168c3ff71bSJunchao Zhang 
13178c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
13188c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
13198c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
13208c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
13218c3ff71bSJunchao Zhang 
13228c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
13238c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
13248c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
13258c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
13268c3ff71bSJunchao Zhang 
13278c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
13288c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
13298c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
13308c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
13318c3ff71bSJunchao Zhang 
13328c3ff71bSJunchao Zhang    Level: intermediate
13338c3ff71bSJunchao Zhang 
1334076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
13358c3ff71bSJunchao Zhang @*/
13368c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
13378c3ff71bSJunchao Zhang {
13388c3ff71bSJunchao Zhang   PetscErrorCode ierr;
13398c3ff71bSJunchao Zhang 
13408c3ff71bSJunchao Zhang   PetscFunctionBegin;
13418c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
13428c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
13438c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
13448c3ff71bSJunchao Zhang   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
13458c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
13468c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
13478c3ff71bSJunchao Zhang }
1348930e68a5SMark Adams 
13498f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
13508f7e8f9dSMark Adams //
135146804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used).
13528f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
13538f7e8f9dSMark Adams //
13548f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1355930e68a5SMark Adams {
13568f7e8f9dSMark Adams   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
13578f7e8f9dSMark Adams   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
13588f7e8f9dSMark Adams   IS                 isrow = b->row,isicol = b->icol;
1359930e68a5SMark Adams   PetscErrorCode     ierr;
13608f7e8f9dSMark Adams   const PetscInt     *r_h,*ic_h;
1361300d22a6SJunchao Zhang   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1362076ba34aSJunchao Zhang   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1363076ba34aSJunchao Zhang   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
13648f7e8f9dSMark Adams   PetscBool          row_identity,col_identity;
136546804e07SMark Adams   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1366930e68a5SMark Adams 
1367930e68a5SMark Adams   PetscFunctionBegin;
13682c71b3e2SJacob Faibussowitsch   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
13698f7e8f9dSMark Adams   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
13702c71b3e2SJacob Faibussowitsch   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
13718f7e8f9dSMark Adams   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
13728f7e8f9dSMark Adams   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
13738f7e8f9dSMark Adams   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
13748f7e8f9dSMark Adams   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
13758f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
13768f7e8f9dSMark Adams   {
13778f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
13788f7e8f9dSMark Adams     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
13798f7e8f9dSMark Adams     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
13808f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
13818f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
13828f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
13838f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
13848f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
13858f7e8f9dSMark Adams     size_t flops_h = 0.0;
13868f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
13878f7e8f9dSMark Adams     Kokkos::View<size_t> d_flops_k ("flops");
13888f7e8f9dSMark Adams     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
13898f7e8f9dSMark Adams     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
13908f7e8f9dSMark Adams     Kokkos::deep_copy (d_flops_k, h_flops_k);
13918f7e8f9dSMark Adams     Kokkos::deep_copy (d_r_k, h_r_k);
13928f7e8f9dSMark Adams     Kokkos::deep_copy (d_ic_k, h_ic_k);
13938f7e8f9dSMark Adams     // Fill A --> fact
13948f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1395042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
13968f7e8f9dSMark Adams         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
13978f7e8f9dSMark Adams         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
13988f7e8f9dSMark Adams         // zero rows of B
13998f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
14008f7e8f9dSMark Adams             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
14018f7e8f9dSMark Adams             PetscScalar *baL = ba_d + bi_d[rowb];
14028f7e8f9dSMark Adams             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
14038f7e8f9dSMark Adams             /* zero (unfactored row) */
14048f7e8f9dSMark Adams             for (int j=0;j<nzbL;j++) baL[j] = 0;
14058f7e8f9dSMark Adams             for (int j=0;j<nzbU;j++) baU[j] = 0;
14068f7e8f9dSMark Adams           });
14078f7e8f9dSMark Adams         // copy A into B
14088f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
14098f7e8f9dSMark Adams             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
14108f7e8f9dSMark Adams             const PetscScalar *av    = aa_d + ai_d[rowa];
14118f7e8f9dSMark Adams             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
14128f7e8f9dSMark Adams             /* load in initial (unfactored row) */
14138f7e8f9dSMark Adams             for (int j=0;j<nza;j++) {
14148f7e8f9dSMark Adams               PetscInt    colb = ic[ajtmp[j]];
14158f7e8f9dSMark Adams               PetscScalar vala = av[j];
14168f7e8f9dSMark Adams               if (colb == rowb) {
14178f7e8f9dSMark Adams                 *(ba_d + bdiag_d[rowb]) = vala;
14188f7e8f9dSMark Adams               } else {
14198f7e8f9dSMark Adams                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
14208f7e8f9dSMark Adams                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
14218f7e8f9dSMark Adams                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
14228f7e8f9dSMark Adams                 for (int j=0; j<nz ; j++) {
14238f7e8f9dSMark Adams                   if (pbj[j] == colb) {
14248f7e8f9dSMark Adams                     pba[j] = vala;
14258f7e8f9dSMark Adams                     set++;
14268f7e8f9dSMark Adams                     break;
14278f7e8f9dSMark Adams                   }
14288f7e8f9dSMark Adams                 }
14298f1da0b2SJunchao Zhang                #if !defined(PETSC_HAVE_SYCL)
14308f7e8f9dSMark Adams                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
14318f1da0b2SJunchao Zhang                #endif
14328f7e8f9dSMark Adams               }
14338f7e8f9dSMark Adams             }
14348f7e8f9dSMark Adams           });
14358f7e8f9dSMark Adams       });
14368f7e8f9dSMark Adams     Kokkos::fence();
1437930e68a5SMark Adams 
14388f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
14398f7e8f9dSMark Adams         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
14408f7e8f9dSMark Adams         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
14418f7e8f9dSMark Adams         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1442042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
14438f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc;
14448f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
14458f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
14468f7e8f9dSMark Adams         for (int ii=start; ii<end-1; ii++) {
14478f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
14488f7e8f9dSMark Adams           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
14498f7e8f9dSMark Adams           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
14508f7e8f9dSMark Adams           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
14518f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
14528f7e8f9dSMark Adams               PetscInt kIdx = j*Ni + field_block_idx;
14538f7e8f9dSMark Adams               if (kIdx >= nzUi) /* void */ ;
14548f7e8f9dSMark Adams               else {
14558f7e8f9dSMark Adams                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
14568f7e8f9dSMark Adams                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
14578f7e8f9dSMark Adams                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
14588f7e8f9dSMark Adams                 size_t         st_idx;
14598f7e8f9dSMark Adams                 // find and do L(k,i) = A(:k,i) / A(i,i)
14608f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
14618f7e8f9dSMark Adams                 // get column, there has got to be a better way
14628f7e8f9dSMark Adams                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
14638f7e8f9dSMark Adams                     if (pjL[j] == ii) {
14648f7e8f9dSMark Adams                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
14658f7e8f9dSMark Adams                       idx = j; // output
14668f7e8f9dSMark Adams                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
14678f7e8f9dSMark Adams                     }
14688f7e8f9dSMark Adams                 }, st_idx);
14698f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
14708f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
147199551766SMark Adams                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
147299551766SMark Adams #endif
147399551766SMark Adams                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
14748f7e8f9dSMark Adams                 // U(i+1,:end)
14758f7e8f9dSMark Adams                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
14768f7e8f9dSMark Adams                       PetscScalar Uij = baUi[uiIdx];
14778f7e8f9dSMark Adams                       PetscInt    col = bjUi[uiIdx];
14788f7e8f9dSMark Adams                       if (col==myk) {
14798f7e8f9dSMark Adams                         // A_kk = A_kk - L_ki * U_ij(k)
14808f7e8f9dSMark Adams                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
14818f7e8f9dSMark Adams                         *Akkv = *Akkv - L_ki() * Uij; // UiK
14828f7e8f9dSMark Adams                       } else {
14838f7e8f9dSMark Adams                         PetscScalar    *start, *end, *pAkjv=NULL;
14848f7e8f9dSMark Adams                         PetscInt       high, low;
14858f7e8f9dSMark Adams                         const PetscInt *startj;
14868f7e8f9dSMark Adams                         if (col<myk) { // L
14878f7e8f9dSMark Adams                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
14888f7e8f9dSMark Adams                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
14898f7e8f9dSMark Adams                           start = pLki+1; // start at pLki+1, A22(myk,1)
14908f7e8f9dSMark Adams                           startj= bj_d + bi_d[myk] + idx;
14918f7e8f9dSMark Adams                           end   = ba_d + bi_d[myk+1];
14928f7e8f9dSMark Adams                         } else {
14938f7e8f9dSMark Adams                           PetscInt idx = bdiag_d[myk+1]+1;
14948f7e8f9dSMark Adams                           start = ba_d + idx;
14958f7e8f9dSMark Adams                           startj= bj_d + idx;
14968f7e8f9dSMark Adams                           end   = ba_d + bdiag_d[myk];
14978f7e8f9dSMark Adams                         }
14988f7e8f9dSMark Adams                         // search for 'col', use bisection search - TODO
14998f7e8f9dSMark Adams                         low  = 0;
15008f7e8f9dSMark Adams                         high = (PetscInt)(end-start);
15018f7e8f9dSMark Adams                         while (high-low > 5) {
15028f7e8f9dSMark Adams                           int t = (low+high)/2;
15038f7e8f9dSMark Adams                           if (startj[t] > col) high = t;
15048f7e8f9dSMark Adams                           else                 low  = t;
15058f7e8f9dSMark Adams                         }
15068f7e8f9dSMark Adams                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
15078f7e8f9dSMark Adams                           if (startj[pAkjv-start] == col) break;
15088f7e8f9dSMark Adams                         }
15098f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
151099551766SMark Adams                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
151199551766SMark Adams #endif
15128f7e8f9dSMark Adams                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
15138f7e8f9dSMark Adams                       }
15148f7e8f9dSMark Adams                     });
15158f7e8f9dSMark Adams               }
15168f7e8f9dSMark Adams             });
15178f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
15188f7e8f9dSMark Adams           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
15198f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
15208f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
15218f7e8f9dSMark Adams       });
15228f7e8f9dSMark Adams     Kokkos::fence();
15238f7e8f9dSMark Adams     Kokkos::deep_copy (h_flops_k, d_flops_k);
15248f7e8f9dSMark Adams     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
15258f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
15268f7e8f9dSMark Adams         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
15278f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
15288f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
15298f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
15308f7e8f9dSMark Adams             int i = start + outer_index*Ni + lg_rank%Ni;
15318f7e8f9dSMark Adams             if (i < end) {
15328f7e8f9dSMark Adams               PetscScalar *pv = ba_d + bdiag_d[i];
15338f7e8f9dSMark Adams               *pv = 1.0/(*pv);
15348f7e8f9dSMark Adams             }
15358f7e8f9dSMark Adams           });
15368f7e8f9dSMark Adams       });
15378f7e8f9dSMark Adams   }
15388f7e8f9dSMark Adams   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
15398f7e8f9dSMark Adams   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
15408f7e8f9dSMark Adams   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
15418f7e8f9dSMark Adams 
15428f7e8f9dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
15438f7e8f9dSMark Adams   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
15448f7e8f9dSMark Adams   if (b->inode.size) {
15458f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
15468f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
15478f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
15488f7e8f9dSMark Adams   } else {
15498f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
15508f7e8f9dSMark Adams   }
15518f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
15528f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
15538f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
15548f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
15558f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
15568f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
15578f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
15588f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
15598f7e8f9dSMark Adams 
1560930e68a5SMark Adams   PetscFunctionReturn(0);
1561930e68a5SMark Adams }
1562930e68a5SMark Adams 
156386a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1564930e68a5SMark Adams {
1565930e68a5SMark Adams   PetscErrorCode   ierr;
1566930e68a5SMark Adams 
1567930e68a5SMark Adams   PetscFunctionBegin;
1568930e68a5SMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
156986a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
157086a27549SJunchao Zhang   PetscFunctionReturn(0);
157186a27549SJunchao Zhang }
157286a27549SJunchao Zhang 
157386a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
157486a27549SJunchao Zhang {
157586a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
157686a27549SJunchao Zhang 
157786a27549SJunchao Zhang   PetscFunctionBegin;
157886a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
157986a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
158086a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
158186a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
158286a27549SJunchao Zhang   }
158386a27549SJunchao Zhang   PetscFunctionReturn(0);
158486a27549SJunchao Zhang }
158586a27549SJunchao Zhang 
158686a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
158786a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
158886a27549SJunchao Zhang {
158986a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1590076ba34aSJunchao Zhang   MatColIdxType             n = A->rmap->n;
159186a27549SJunchao Zhang 
159286a27549SJunchao Zhang   PetscFunctionBegin;
159386a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
159486a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
1595076ba34aSJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
159686a27549SJunchao Zhang     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1597076ba34aSJunchao Zhang     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1598076ba34aSJunchao Zhang     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
159986a27549SJunchao Zhang 
160086a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1601076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1602076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1603076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
160486a27549SJunchao Zhang         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
160586a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
160686a27549SJunchao Zhang 
160786a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
160886a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
160986a27549SJunchao Zhang     */
1610076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1611076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
161286a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
161386a27549SJunchao Zhang 
161486a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
161586a27549SJunchao Zhang 
161686a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
1617076ba34aSJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
161886a27549SJunchao Zhang     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1619076ba34aSJunchao Zhang     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1620076ba34aSJunchao Zhang     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
162186a27549SJunchao Zhang 
162286a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1623076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1624076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1625076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
162686a27549SJunchao Zhang         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
162786a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
162886a27549SJunchao Zhang 
162986a27549SJunchao Zhang     /* Sort indices. See comments above */
1630076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1631076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
163286a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
163386a27549SJunchao Zhang 
163486a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
163586a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
163686a27549SJunchao Zhang   }
163786a27549SJunchao Zhang   PetscFunctionReturn(0);
163886a27549SJunchao Zhang }
163986a27549SJunchao Zhang 
164086a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
164186a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
164286a27549SJunchao Zhang {
164386a27549SJunchao Zhang   PetscErrorCode                 ierr;
164486a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
164586a27549SJunchao Zhang   PetscScalarKokkosView          xv;
164686a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
164786a27549SJunchao Zhang 
164886a27549SJunchao Zhang   PetscFunctionBegin;
1649eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
165086a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
165186a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
165286a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
165386a27549SJunchao Zhang   /* Solve L tmpv = b */
16543f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
165586a27549SJunchao Zhang   /* Solve Ux = tmpv */
16563f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
165786a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
165886a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1659eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
166086a27549SJunchao Zhang   PetscFunctionReturn(0);
166186a27549SJunchao Zhang }
166286a27549SJunchao Zhang 
1663076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
166486a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
166586a27549SJunchao Zhang {
166686a27549SJunchao Zhang   PetscErrorCode                 ierr;
166786a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
166886a27549SJunchao Zhang   PetscScalarKokkosView          xv;
166986a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
167086a27549SJunchao Zhang 
167186a27549SJunchao Zhang   PetscFunctionBegin;
1672eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
167386a27549SJunchao Zhang   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
167486a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
167586a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
167686a27549SJunchao Zhang   /* Solve U^T tmpv = b */
167786a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
167886a27549SJunchao Zhang 
167986a27549SJunchao Zhang   /* Solve L^T x = tmpv */
168086a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
168186a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
168286a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1683eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
168486a27549SJunchao Zhang   PetscFunctionReturn(0);
168586a27549SJunchao Zhang }
168686a27549SJunchao Zhang 
168786a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
168886a27549SJunchao Zhang {
168986a27549SJunchao Zhang   PetscErrorCode                 ierr;
169086a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
169186a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
169286a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
169386a27549SJunchao Zhang 
169486a27549SJunchao Zhang   PetscFunctionBegin;
1695eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
169686a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1697076ba34aSJunchao Zhang 
1698076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1699076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1700076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1701076ba34aSJunchao Zhang 
1702076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
170386a27549SJunchao Zhang 
170486a27549SJunchao Zhang   B->assembled                       = PETSC_TRUE;
170586a27549SJunchao Zhang   B->preallocated                    = PETSC_TRUE;
170686a27549SJunchao Zhang   B->ops->solve                      = MatSolve_SeqAIJKokkos;
170786a27549SJunchao Zhang   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
170886a27549SJunchao Zhang   B->ops->matsolve                   = NULL;
170986a27549SJunchao Zhang   B->ops->matsolvetranspose          = NULL;
171086a27549SJunchao Zhang   B->offloadmask                     = PETSC_OFFLOAD_GPU;
171186a27549SJunchao Zhang 
171286a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
171386a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
171486a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1715eeadb341SJunchao Zhang   /* TODO: log flops, but how to know that? */
1716eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
171786a27549SJunchao Zhang   PetscFunctionReturn(0);
171886a27549SJunchao Zhang }
171986a27549SJunchao Zhang 
172086a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
172186a27549SJunchao Zhang {
172286a27549SJunchao Zhang   PetscErrorCode                 ierr;
172386a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok;
172486a27549SJunchao Zhang   Mat_SeqAIJ                     *b;
172586a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
172686a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
172786a27549SJunchao Zhang   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
172886a27549SJunchao Zhang   PetscInt                       n = A->rmap->n;
172986a27549SJunchao Zhang 
173086a27549SJunchao Zhang   PetscFunctionBegin;
173186a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
173286a27549SJunchao Zhang   /* Rebuild factors */
173386a27549SJunchao Zhang   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
173486a27549SJunchao Zhang   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
173586a27549SJunchao Zhang 
173686a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
173786a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
173886a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
173986a27549SJunchao Zhang 
174086a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
174186a27549SJunchao Zhang 
174286a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
174386a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
174486a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d,n+1);
174586a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
174686a27549SJunchao Zhang 
174786a27549SJunchao Zhang   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1748076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1749076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1750076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
175186a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
175286a27549SJunchao Zhang 
175386a27549SJunchao Zhang   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
175486a27549SJunchao Zhang   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
175586a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
175686a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
175786a27549SJunchao Zhang 
175886a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
175986a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
176086a27549SJunchao Zhang  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
176186a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
176286a27549SJunchao Zhang  #else
176386a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
176486a27549SJunchao Zhang  #endif
176586a27549SJunchao Zhang 
176686a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
176786a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
176886a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
176986a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
177086a27549SJunchao Zhang 
177186a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
177286a27549SJunchao Zhang   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
177386a27549SJunchao Zhang   b     = (Mat_SeqAIJ*)B->data;
177486a27549SJunchao Zhang   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
177586a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
177686a27549SJunchao Zhang   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
177786a27549SJunchao Zhang 
177886a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
177986a27549SJunchao Zhang   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1780930e68a5SMark Adams   PetscFunctionReturn(0);
1781930e68a5SMark Adams }
1782930e68a5SMark Adams 
17838f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
17848f7e8f9dSMark Adams {
17858f7e8f9dSMark Adams   PetscErrorCode   ierr;
17868f7e8f9dSMark Adams   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
17878f7e8f9dSMark Adams   const PetscInt   nrows   = A->rmap->n;
1788930e68a5SMark Adams 
17898f7e8f9dSMark Adams   PetscFunctionBegin;
17908f7e8f9dSMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
17918f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
17928f7e8f9dSMark Adams   // move B data into Kokkos
17938f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
17948f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
17958f7e8f9dSMark Adams   {
17968f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1797300d22a6SJunchao Zhang     if (!baijkok->diag_d.extent(0)) {
17988f7e8f9dSMark Adams       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1799300d22a6SJunchao Zhang       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1800300d22a6SJunchao Zhang       Kokkos::deep_copy (baijkok->diag_d, h_diag);
18018f7e8f9dSMark Adams     }
18028f7e8f9dSMark Adams   }
18038f7e8f9dSMark Adams   PetscFunctionReturn(0);
18048f7e8f9dSMark Adams }
18058f7e8f9dSMark Adams 
180686a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1807930e68a5SMark Adams {
1808930e68a5SMark Adams   PetscFunctionBegin;
1809930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
1810930e68a5SMark Adams   PetscFunctionReturn(0);
1811930e68a5SMark Adams }
1812930e68a5SMark Adams 
18138f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
18148f7e8f9dSMark Adams {
18158f7e8f9dSMark Adams   PetscFunctionBegin;
18168f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
18178f7e8f9dSMark Adams   PetscFunctionReturn(0);
18188f7e8f9dSMark Adams }
18198f7e8f9dSMark Adams 
1820930e68a5SMark Adams /*MC
182186a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
182286a27549SJunchao Zhang   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1823930e68a5SMark Adams 
1824930e68a5SMark Adams   Level: beginner
1825930e68a5SMark Adams 
182686a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1827930e68a5SMark Adams M*/
182886a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1829930e68a5SMark Adams {
1830930e68a5SMark Adams   PetscErrorCode ierr;
1831930e68a5SMark Adams   PetscInt       n = A->rmap->n;
1832930e68a5SMark Adams 
1833930e68a5SMark Adams   PetscFunctionBegin;
1834930e68a5SMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1835930e68a5SMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1836930e68a5SMark Adams   (*B)->factortype = ftype;
18374ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1838930e68a5SMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1839930e68a5SMark Adams 
18408f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
1841930e68a5SMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
184286a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_TRUE;
184386a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
184486a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
184586a27549SJunchao Zhang     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
184686a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
184786a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
184898921bdaSJacob Faibussowitsch   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1849930e68a5SMark Adams 
1850930e68a5SMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
185186a27549SJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1852930e68a5SMark Adams   PetscFunctionReturn(0);
1853930e68a5SMark Adams }
18548f7e8f9dSMark Adams 
18558f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
18568f7e8f9dSMark Adams {
18578f7e8f9dSMark Adams   PetscErrorCode ierr;
18588f7e8f9dSMark Adams   PetscInt       n = A->rmap->n;
18598f7e8f9dSMark Adams 
18608f7e8f9dSMark Adams   PetscFunctionBegin;
18618f7e8f9dSMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
18628f7e8f9dSMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
18638f7e8f9dSMark Adams   (*B)->factortype = ftype;
1864f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
18654ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
18668f7e8f9dSMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
18678f7e8f9dSMark Adams 
18688f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
18698f7e8f9dSMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
18708f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
18718f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
18728f7e8f9dSMark Adams 
18738f7e8f9dSMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
18748f7e8f9dSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
18758f7e8f9dSMark Adams   PetscFunctionReturn(0);
18768f7e8f9dSMark Adams }
187786a27549SJunchao Zhang 
187886a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
187986a27549SJunchao Zhang {
188086a27549SJunchao Zhang   PetscErrorCode ierr;
188186a27549SJunchao Zhang 
188286a27549SJunchao Zhang   PetscFunctionBegin;
188386a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
188486a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
188586a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
188686a27549SJunchao Zhang   PetscFunctionReturn(0);
188786a27549SJunchao Zhang }
188886a27549SJunchao Zhang 
1889076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
1890076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1891076ba34aSJunchao Zhang {
1892076ba34aSJunchao Zhang   PetscErrorCode    ierr;
1893076ba34aSJunchao Zhang   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1894076ba34aSJunchao Zhang   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1895076ba34aSJunchao Zhang   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1896076ba34aSJunchao Zhang   const PetscInt    *i = iv.data();
1897076ba34aSJunchao Zhang   const PetscInt    *j = jv.data();
1898076ba34aSJunchao Zhang   const PetscScalar *a = av.data();
1899076ba34aSJunchao Zhang   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1900076ba34aSJunchao Zhang 
1901076ba34aSJunchao Zhang   PetscFunctionBegin;
1902c0aa6a63SJacob Faibussowitsch   ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1903076ba34aSJunchao Zhang   for (PetscInt k=0; k<m; k++) {
1904c0aa6a63SJacob Faibussowitsch     ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr);
1905076ba34aSJunchao Zhang     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1906c0aa6a63SJacob Faibussowitsch       ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1907076ba34aSJunchao Zhang     }
1908076ba34aSJunchao Zhang     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1909076ba34aSJunchao Zhang   }
1910076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1911076ba34aSJunchao Zhang }
1912