xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 8f1da0b2fa678aa93b3ff8fcae317c7440bcb6ad)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2076ba34aSJunchao Zhang #include <petscpkg_version.h>
3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
442550becSJunchao Zhang #include <petsc/private/sfimpl.h>
58c3ff71bSJunchao Zhang #include <petscsystypes.h>
68c3ff71bSJunchao Zhang #include <petscerror.h>
78c3ff71bSJunchao Zhang 
88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
9f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
10076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp>
118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
128c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
15076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
16076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
1786a27549SJunchao Zhang 
188c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h>
1942550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
208c3ff71bSJunchao Zhang 
218c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
228c3ff71bSJunchao Zhang 
23076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
24076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
25076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
26076ba34aSJunchao Zhang  */
278c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
288c3ff71bSJunchao Zhang {
298c3ff71bSJunchao Zhang   PetscErrorCode    ierr;
30076ba34aSJunchao Zhang   Mat_SeqAIJ        *aijseq;
31076ba34aSJunchao Zhang   Mat_SeqAIJKokkos  *aijkok;
328c3ff71bSJunchao Zhang 
338c3ff71bSJunchao Zhang   PetscFunctionBegin;
34076ba34aSJunchao Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
358c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
36076ba34aSJunchao Zhang 
37076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
38076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
39076ba34aSJunchao Zhang 
40076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
41076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
42076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
43076ba34aSJunchao Zhang   */
44076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
45076ba34aSJunchao Zhang     delete aijkok;
46076ba34aSJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
47076ba34aSJunchao Zhang     A->spptr = aijkok;
48076ba34aSJunchao Zhang   }
49076ba34aSJunchao Zhang 
50a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
51a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
52a587d139SMark   }
538c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
548c3ff71bSJunchao Zhang }
558c3ff71bSJunchao Zhang 
5686a27549SJunchao Zhang /* Sync CSR data to device if not yet */
57076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
588c3ff71bSJunchao Zhang {
598c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
608c3ff71bSJunchao Zhang 
618c3ff71bSJunchao Zhang   PetscFunctionBegin;
6286a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
63076ba34aSJunchao Zhang   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
64076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
65076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
66076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
67076ba34aSJunchao Zhang     aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */
6886a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
698c3ff71bSJunchao Zhang   }
708c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
718c3ff71bSJunchao Zhang }
728c3ff71bSJunchao Zhang 
73076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
74fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
7586a27549SJunchao Zhang {
7686a27549SJunchao Zhang   PetscErrorCode   ierr;
7786a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
7886a27549SJunchao Zhang 
7986a27549SJunchao Zhang   PetscFunctionBegin;
8086a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
8186a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
8286a27549SJunchao Zhang   aijkok->a_dual.modify_device();
8386a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
8486a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
852328674fSJunchao Zhang   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
8686a27549SJunchao Zhang   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
8786a27549SJunchao Zhang   PetscFunctionReturn(0);
8886a27549SJunchao Zhang }
8986a27549SJunchao Zhang 
90f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
91f0cf5187SStefano Zampini {
92f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
93f0cf5187SStefano Zampini 
94f0cf5187SStefano Zampini   PetscFunctionBegin;
95f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9686a27549SJunchao Zhang    /* We do not expect one needs factors on host  */
9786a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
98f0cf5187SStefano Zampini   if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
99076ba34aSJunchao Zhang   aijkok->a_dual.sync_host();
100f0cf5187SStefano Zampini   PetscFunctionReturn(0);
101f0cf5187SStefano Zampini }
102f0cf5187SStefano Zampini 
103f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
104f0cf5187SStefano Zampini {
105076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
106f0cf5187SStefano Zampini 
107f0cf5187SStefano Zampini   PetscFunctionBegin;
108076ba34aSJunchao Zhang   if (aijkok) {
109076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
110076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
111076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
112076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
113076ba34aSJunchao Zhang   }
114076ba34aSJunchao Zhang   PetscFunctionReturn(0);
115076ba34aSJunchao Zhang }
116076ba34aSJunchao Zhang 
117076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
118076ba34aSJunchao Zhang {
119076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
120076ba34aSJunchao Zhang 
121076ba34aSJunchao Zhang   PetscFunctionBegin;
122076ba34aSJunchao Zhang   if (aijkok) aijkok->a_dual.modify_host();
123076ba34aSJunchao Zhang   PetscFunctionReturn(0);
124076ba34aSJunchao Zhang }
125076ba34aSJunchao Zhang 
126076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
127076ba34aSJunchao Zhang {
128076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
129076ba34aSJunchao Zhang 
130076ba34aSJunchao Zhang   PetscFunctionBegin;
1312328674fSJunchao Zhang   if (aijkok) {
132076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
133076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1342328674fSJunchao Zhang   } else {
1352328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1362328674fSJunchao Zhang   }
137076ba34aSJunchao Zhang   PetscFunctionReturn(0);
138076ba34aSJunchao Zhang }
139076ba34aSJunchao Zhang 
140076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
141076ba34aSJunchao Zhang {
142076ba34aSJunchao Zhang   PetscFunctionBegin;
143076ba34aSJunchao Zhang   *array = NULL;
144076ba34aSJunchao Zhang   PetscFunctionReturn(0);
145076ba34aSJunchao Zhang }
146076ba34aSJunchao Zhang 
147076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
148076ba34aSJunchao Zhang {
149076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
150076ba34aSJunchao Zhang 
151076ba34aSJunchao Zhang   PetscFunctionBegin;
1522328674fSJunchao Zhang   if (aijkok) {
153076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1542328674fSJunchao Zhang   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
1552328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1562328674fSJunchao Zhang   }
157076ba34aSJunchao Zhang   PetscFunctionReturn(0);
158076ba34aSJunchao Zhang }
159076ba34aSJunchao Zhang 
160076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
161076ba34aSJunchao Zhang {
162076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
163076ba34aSJunchao Zhang 
164076ba34aSJunchao Zhang   PetscFunctionBegin;
1652328674fSJunchao Zhang   if (aijkok) {
166076ba34aSJunchao Zhang     aijkok->a_dual.clear_sync_state();
167076ba34aSJunchao Zhang     aijkok->a_dual.modify_host();
1682328674fSJunchao Zhang   }
169f0cf5187SStefano Zampini   PetscFunctionReturn(0);
170f0cf5187SStefano Zampini }
171f0cf5187SStefano Zampini 
172a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
173042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
174a587d139SMark {
175042217e8SBarry Smith   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
176042217e8SBarry Smith   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
177a587d139SMark 
178a587d139SMark   PetscFunctionBegin;
179076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
180152b3e56SJunchao Zhang   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
181a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
182a587d139SMark   PetscFunctionReturn(0);
183a587d139SMark }
184a587d139SMark 
185a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
186042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
187a587d139SMark {
188042217e8SBarry Smith   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
189a587d139SMark 
190a587d139SMark   PetscFunctionBegin;
191a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
192a587d139SMark     *d_mat = aijkok->device_mat_d.data();
193a587d139SMark   } else {
194a587d139SMark     PetscErrorCode   ierr;
195a587d139SMark     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
196a587d139SMark     *d_mat  = NULL;
197a587d139SMark   }
198a587d139SMark   PetscFunctionReturn(0);
199a587d139SMark }
200076ba34aSJunchao Zhang 
201076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */
202076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
203152b3e56SJunchao Zhang {
204152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
205152b3e56SJunchao Zhang 
206152b3e56SJunchao Zhang   PetscFunctionBegin;
207076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
208076ba34aSJunchao Zhang   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
209076ba34aSJunchao Zhang     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
210076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
211076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
212076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
21386a27549SJunchao Zhang     aijkok->transpose_updated = PETSC_TRUE;
214076ba34aSJunchao Zhang   }
215076ba34aSJunchao Zhang   *csrmatT = &aijkok->csrmatT;
216152b3e56SJunchao Zhang   PetscFunctionReturn(0);
217152b3e56SJunchao Zhang }
218152b3e56SJunchao Zhang 
219076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */
220076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
221152b3e56SJunchao Zhang {
222eeadb341SJunchao Zhang   PetscErrorCode                   ierr;
223152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
224152b3e56SJunchao Zhang 
225152b3e56SJunchao Zhang   PetscFunctionBegin;
226eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
227076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
228076ba34aSJunchao Zhang   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
229076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
230076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
231076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
232076ba34aSJunchao Zhang    #if defined(PETSC_USE_COMPLEX)
233076ba34aSJunchao Zhang     const auto& a = aijkok->csrmatH.values;
234076ba34aSJunchao Zhang     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
235076ba34aSJunchao Zhang    #endif
23686a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_TRUE;
237076ba34aSJunchao Zhang   }
238076ba34aSJunchao Zhang   *csrmatH = &aijkok->csrmatH;
239eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
240152b3e56SJunchao Zhang   PetscFunctionReturn(0);
241152b3e56SJunchao Zhang }
242a587d139SMark 
2438c3ff71bSJunchao Zhang /* y = A x */
2448c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2458c3ff71bSJunchao Zhang {
2468c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2478c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
248152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
249152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2508c3ff71bSJunchao Zhang 
2518c3ff71bSJunchao Zhang   PetscFunctionBegin;
2528c3ff71bSJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
253152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
254152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
2558c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
256eeadb341SJunchao Zhang   ierr   = PetscLogGpuTimeBegin();CHKERRQ(ierr);
257152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
258152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
259152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
260076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
261152b3e56SJunchao Zhang   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2628c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2638c3ff71bSJunchao Zhang }
2648c3ff71bSJunchao Zhang 
2658c3ff71bSJunchao Zhang /* y = A^T x */
2668c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2678c3ff71bSJunchao Zhang {
2688c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2698c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
270152b3e56SJunchao Zhang   const char                       *mode;
271152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
272152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
273076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
2748c3ff71bSJunchao Zhang 
2758c3ff71bSJunchao Zhang   PetscFunctionBegin;
276eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2778c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
278152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
279152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
280152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
281076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
282152b3e56SJunchao Zhang     mode = "N";
283152b3e56SJunchao Zhang   } else {
284076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
285076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
286152b3e56SJunchao Zhang     mode = "T";
287152b3e56SJunchao Zhang   }
288076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
289152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
290152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
291076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
292eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2938c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2948c3ff71bSJunchao Zhang }
2958c3ff71bSJunchao Zhang 
2968c3ff71bSJunchao Zhang /* y = A^H x */
2978c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2988c3ff71bSJunchao Zhang {
2998c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3008c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
301152b3e56SJunchao Zhang   const char                       *mode;
302152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
303152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
304076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3058c3ff71bSJunchao Zhang 
3068c3ff71bSJunchao Zhang   PetscFunctionBegin;
307eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3088c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
309152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
310152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
311152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
312076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
313152b3e56SJunchao Zhang     mode = "N";
314152b3e56SJunchao Zhang   } else {
315076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
316076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
317152b3e56SJunchao Zhang     mode = "C";
318152b3e56SJunchao Zhang   }
319076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
320152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
321152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
322076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
323eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3248c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3258c3ff71bSJunchao Zhang }
3268c3ff71bSJunchao Zhang 
3278c3ff71bSJunchao Zhang /* z = A x + y */
3288c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
3298c3ff71bSJunchao Zhang {
3308c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3318c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
332152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
333152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
3348c3ff71bSJunchao Zhang 
3358c3ff71bSJunchao Zhang   PetscFunctionBegin;
336eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3378c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
338152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
339152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
340152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3418c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
3428c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
343152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
344152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
345152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
346152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
347152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
348eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3498c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3508c3ff71bSJunchao Zhang }
3518c3ff71bSJunchao Zhang 
3528c3ff71bSJunchao Zhang /* z = A^T x + y */
3538c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3548c3ff71bSJunchao Zhang {
3558c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3568c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
357152b3e56SJunchao Zhang   const char                       *mode;
358152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
359152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
360076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3618c3ff71bSJunchao Zhang 
3628c3ff71bSJunchao Zhang   PetscFunctionBegin;
363eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3648c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
365152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
366152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
367152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3688c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
369152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
370076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
371152b3e56SJunchao Zhang     mode = "N";
372152b3e56SJunchao Zhang   } else {
373076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
374076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
375152b3e56SJunchao Zhang     mode = "T";
376152b3e56SJunchao Zhang   }
377076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
378152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
379152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
380152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
381076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
382eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3838c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3848c3ff71bSJunchao Zhang }
3858c3ff71bSJunchao Zhang 
3868c3ff71bSJunchao Zhang /* z = A^H x + y */
3878c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3888c3ff71bSJunchao Zhang {
3898c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3908c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
391152b3e56SJunchao Zhang   const char                       *mode;
392152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
393152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
394076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3958c3ff71bSJunchao Zhang 
3968c3ff71bSJunchao Zhang   PetscFunctionBegin;
397eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3988c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
399152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
400152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
401152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
4028c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
403152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
404076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
405152b3e56SJunchao Zhang     mode = "N";
406152b3e56SJunchao Zhang   } else {
407076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
408076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
409152b3e56SJunchao Zhang     mode = "C";
410152b3e56SJunchao Zhang   }
411076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
412152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
413152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
414152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
415076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
416eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
417152b3e56SJunchao Zhang   PetscFunctionReturn(0);
418152b3e56SJunchao Zhang }
419152b3e56SJunchao Zhang 
420152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
421152b3e56SJunchao Zhang {
422152b3e56SJunchao Zhang   PetscErrorCode            ierr;
423152b3e56SJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
424152b3e56SJunchao Zhang 
425152b3e56SJunchao Zhang   PetscFunctionBegin;
426152b3e56SJunchao Zhang   switch (op) {
427152b3e56SJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
428152b3e56SJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
429152b3e56SJunchao Zhang       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
430152b3e56SJunchao Zhang       A->form_explicit_transpose = flg;
431152b3e56SJunchao Zhang       break;
432152b3e56SJunchao Zhang     default:
433152b3e56SJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
434152b3e56SJunchao Zhang       break;
435152b3e56SJunchao Zhang   }
4368c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4378c3ff71bSJunchao Zhang }
4388c3ff71bSJunchao Zhang 
439076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
4403d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
4418c3ff71bSJunchao Zhang {
4428c3ff71bSJunchao Zhang   PetscErrorCode   ierr;
443076ba34aSJunchao Zhang   Mat_SeqAIJ       *aseq;
4448c3ff71bSJunchao Zhang 
4458c3ff71bSJunchao Zhang   PetscFunctionBegin;
446a3f881fbSStefano Zampini   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
447076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
448076ba34aSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
4498c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
450076ba34aSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
451076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
452076ba34aSJunchao Zhang     if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
453076ba34aSJunchao Zhang     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
454076ba34aSJunchao Zhang     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
455076ba34aSJunchao Zhang     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
456076ba34aSJunchao Zhang     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
457076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ*>(A->data);
458076ba34aSJunchao Zhang     if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */
459076ba34aSJunchao Zhang       if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
460076ba34aSJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
4618c3ff71bSJunchao Zhang     }
462076ba34aSJunchao Zhang   }
4638c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4648c3ff71bSJunchao Zhang }
4658c3ff71bSJunchao Zhang 
466076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
467076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
468076ba34aSJunchao Zhang  */
469076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
4708c3ff71bSJunchao Zhang {
4718c3ff71bSJunchao Zhang   PetscErrorCode        ierr;
472076ba34aSJunchao Zhang   Mat_SeqAIJ            *bseq;
473076ba34aSJunchao Zhang   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
474076ba34aSJunchao Zhang   Mat                   mat;
4758c3ff71bSJunchao Zhang 
4768c3ff71bSJunchao Zhang   PetscFunctionBegin;
477076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
478076ba34aSJunchao Zhang   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
479076ba34aSJunchao Zhang   mat  = *B;
480076ba34aSJunchao Zhang   if (A->assembled) {
481076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
482076ba34aSJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
483076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
484076ba34aSJunchao Zhang     /* Now copy values to B if needed */
485076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
486076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
487076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
488076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
489076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
490076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
491076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
492076ba34aSJunchao Zhang       }
493076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
494076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
495076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
496076ba34aSJunchao Zhang     }
497076ba34aSJunchao Zhang     mat->spptr = bkok;
498076ba34aSJunchao Zhang   }
499076ba34aSJunchao Zhang 
500076ba34aSJunchao Zhang   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
501076ba34aSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
502076ba34aSJunchao Zhang   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
503076ba34aSJunchao Zhang   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
5048c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5058c3ff71bSJunchao Zhang }
5068c3ff71bSJunchao Zhang 
5070ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
5080ecb592aSJunchao Zhang {
5090ecb592aSJunchao Zhang   PetscErrorCode    ierr;
5100ecb592aSJunchao Zhang   Mat               At;
5110ecb592aSJunchao Zhang   KokkosCsrMatrix   *internT,*csrmatT;
5120ecb592aSJunchao Zhang   Mat_SeqAIJKokkos  *atkok,*bkok;
5130ecb592aSJunchao Zhang 
5140ecb592aSJunchao Zhang   PetscFunctionBegin;
5150ecb592aSJunchao Zhang   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
5160ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
5170ecb592aSJunchao Zhang     CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */
5180ecb592aSJunchao Zhang     CHKERRCXX(atkok   = new Mat_SeqAIJKokkos(*csrmatT));
5190ecb592aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
5200ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
5217a3b1c56SStefano Zampini     else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
5220ecb592aSJunchao Zhang   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
5230ecb592aSJunchao Zhang     if ((*B)->assembled) {
5240ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
5250ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
5260ecb592aSJunchao Zhang       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
5270ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
5280ecb592aSJunchao Zhang       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
5290ecb592aSJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
5300ecb592aSJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
5310ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
5320ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
5330ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
5340ecb592aSJunchao Zhang   }
5350ecb592aSJunchao Zhang   PetscFunctionReturn(0);
5360ecb592aSJunchao Zhang }
5370ecb592aSJunchao Zhang 
5388c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
5398c3ff71bSJunchao Zhang {
5408c3ff71bSJunchao Zhang   PetscErrorCode             ierr;
54186a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
5428c3ff71bSJunchao Zhang 
5438c3ff71bSJunchao Zhang   PetscFunctionBegin;
54486a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
54586a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
5468f7e8f9dSMark Adams     if (aijkok) {
5478f7e8f9dSMark Adams       if (aijkok->device_mat_d.data()) {
548a587d139SMark         delete aijkok->colmap_d;
549a587d139SMark         delete aijkok->i_uncompressed_d;
550a587d139SMark       }
5518f7e8f9dSMark Adams       if (aijkok->diag_d) delete aijkok->diag_d;
5528f7e8f9dSMark Adams     }
5538c3ff71bSJunchao Zhang     delete aijkok;
55486a27549SJunchao Zhang   } else {
55586a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
55686a27549SJunchao Zhang   }
557152b3e56SJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
55842550becSJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
55942550becSJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       NULL);CHKERRQ(ierr);
560152b3e56SJunchao Zhang   A->spptr = NULL;
5618c3ff71bSJunchao Zhang   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
5628c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5638c3ff71bSJunchao Zhang }
5648c3ff71bSJunchao Zhang 
56586a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
56686a27549SJunchao Zhang {
56786a27549SJunchao Zhang   PetscErrorCode ierr;
56886a27549SJunchao Zhang 
56986a27549SJunchao Zhang   PetscFunctionBegin;
57086a27549SJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
57186a27549SJunchao Zhang   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
57286a27549SJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
57386a27549SJunchao Zhang   PetscFunctionReturn(0);
57486a27549SJunchao Zhang }
57586a27549SJunchao Zhang 
576076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
577076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
578a3f881fbSStefano Zampini {
579076ba34aSJunchao Zhang   PetscErrorCode               ierr;
580076ba34aSJunchao Zhang   Mat_SeqAIJ                   *a,*b;
581076ba34aSJunchao Zhang   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
582076ba34aSJunchao Zhang   MatScalarKokkosView          aa,ba,ca;
583076ba34aSJunchao Zhang   MatRowMapKokkosView          ai,bi,ci;
584076ba34aSJunchao Zhang   MatColIdxKokkosView          aj,bj,cj;
585076ba34aSJunchao Zhang   PetscInt                     m,n,nnz,aN;
586a3f881fbSStefano Zampini 
587a3f881fbSStefano Zampini   PetscFunctionBegin;
588076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
589076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
590076ba34aSJunchao Zhang   PetscValidPointer(C,4);
591076ba34aSJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
592076ba34aSJunchao Zhang   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
593c0aa6a63SJacob Faibussowitsch   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
594076ba34aSJunchao Zhang   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
595076ba34aSJunchao Zhang 
596076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
597076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
598076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ*>(A->data);
599076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ*>(B->data);
600076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
601076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
602076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
603076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
604076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
605076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
606076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
607076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
608076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
609076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
610076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
611076ba34aSJunchao Zhang     aj = akok->j_dual.view_device();
612076ba34aSJunchao Zhang     bj = bkok->j_dual.view_device();
613076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
614076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
615076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
616076ba34aSJunchao Zhang     ca = ca_dual.view_device();
617076ba34aSJunchao Zhang     ci = ci_dual.view_device();
618076ba34aSJunchao Zhang     cj = cj_dual.view_device();
619076ba34aSJunchao Zhang 
620076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
621076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
622076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
623076ba34aSJunchao Zhang       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
624076ba34aSJunchao Zhang 
625076ba34aSJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
626076ba34aSJunchao Zhang         ci(i) = coffset;
627076ba34aSJunchao Zhang         if (i == m-1) ci(m) = ai(m) + bi(m);
628076ba34aSJunchao Zhang       });
629076ba34aSJunchao Zhang 
630076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
631076ba34aSJunchao Zhang         if (k < alen) {
632076ba34aSJunchao Zhang           ca(coffset+k) = aa(ai(i)+k);
633076ba34aSJunchao Zhang           cj(coffset+k) = aj(ai(i)+k);
634076ba34aSJunchao Zhang         } else {
635076ba34aSJunchao Zhang           ca(coffset+k) = ba(bi(i)+k-alen);
636076ba34aSJunchao Zhang           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
637076ba34aSJunchao Zhang         }
638076ba34aSJunchao Zhang       });
639076ba34aSJunchao Zhang     });
640076ba34aSJunchao Zhang     ca_dual.modify_device();
641076ba34aSJunchao Zhang     ci_dual.modify_device();
642076ba34aSJunchao Zhang     cj_dual.modify_device();
643076ba34aSJunchao Zhang     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
644076ba34aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
645076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
646076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
647076ba34aSJunchao Zhang     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
648076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
649076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
650076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
651076ba34aSJunchao Zhang 
652076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
653076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
654076ba34aSJunchao Zhang       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
655076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
656076ba34aSJunchao Zhang         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
657076ba34aSJunchao Zhang         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
658076ba34aSJunchao Zhang       });
659076ba34aSJunchao Zhang     });
660076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
661076ba34aSJunchao Zhang   }
662076ba34aSJunchao Zhang   PetscFunctionReturn(0);
663076ba34aSJunchao Zhang }
664076ba34aSJunchao Zhang 
665076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
666076ba34aSJunchao Zhang {
667076ba34aSJunchao Zhang   PetscFunctionBegin;
668076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
669a3f881fbSStefano Zampini   PetscFunctionReturn(0);
670a3f881fbSStefano Zampini }
671a3f881fbSStefano Zampini 
672a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
673a3f881fbSStefano Zampini {
674076ba34aSJunchao Zhang   PetscErrorCode                 ierr;
675a3f881fbSStefano Zampini   Mat_Product                    *product = C->product;
676a3f881fbSStefano Zampini   Mat                            A,B;
677076ba34aSJunchao Zhang   bool                           transA,transB; /* use bool, since KK needs this type */
678a3f881fbSStefano Zampini   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
679a3f881fbSStefano Zampini   Mat_SeqAIJ                     *c;
680076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
681076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB;
682a3f881fbSStefano Zampini 
683a3f881fbSStefano Zampini   PetscFunctionBegin;
684a3f881fbSStefano Zampini   MatCheckProduct(C,1);
685a3f881fbSStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
686076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
687076ba34aSJunchao Zhang 
688076ba34aSJunchao Zhang   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
689076ba34aSJunchao Zhang     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
690076ba34aSJunchao Zhang     PetscFunctionReturn(0);
691076ba34aSJunchao Zhang   }
692076ba34aSJunchao Zhang 
693076ba34aSJunchao Zhang   switch (product->type) {
694076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
695076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
696076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
697076ba34aSJunchao Zhang     default:
698076ba34aSJunchao Zhang       SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
699076ba34aSJunchao Zhang   }
700076ba34aSJunchao Zhang 
701a3f881fbSStefano Zampini   A     = product->A;
702a3f881fbSStefano Zampini   B     = product->B;
703a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
704a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
705a3f881fbSStefano Zampini   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
706a3f881fbSStefano Zampini   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
707a3f881fbSStefano Zampini   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
708076ba34aSJunchao Zhang 
709076ba34aSJunchao Zhang   if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
710076ba34aSJunchao Zhang 
711076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
712076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
713076ba34aSJunchao Zhang 
714076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
715076ba34aSJunchao Zhang   if (transA) {
716076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
717076ba34aSJunchao Zhang     transA = false;
718a3f881fbSStefano Zampini   }
719a3f881fbSStefano Zampini 
720076ba34aSJunchao Zhang   if (transB) {
721076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
722076ba34aSJunchao Zhang     transB = false;
723076ba34aSJunchao Zhang   }
724eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
725076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
726076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
727eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
728076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
729a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
730a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
731c0aa6a63SJacob Faibussowitsch   ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
732a3f881fbSStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
733c0aa6a63SJacob Faibussowitsch   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
734a3f881fbSStefano Zampini   c->reallocs         = 0;
735076ba34aSJunchao Zhang   C->info.mallocs     = 0;
736a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
737a3f881fbSStefano Zampini   C->assembled        = C->was_assembled = PETSC_TRUE;
738a3f881fbSStefano Zampini   C->num_ass++;
739a3f881fbSStefano Zampini   PetscFunctionReturn(0);
740a3f881fbSStefano Zampini }
741a3f881fbSStefano Zampini 
742a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
743a3f881fbSStefano Zampini {
744a3f881fbSStefano Zampini   PetscErrorCode                 ierr;
745076ba34aSJunchao Zhang   Mat_Product                    *product = C->product;
746076ba34aSJunchao Zhang   MatProductType                 ptype;
747076ba34aSJunchao Zhang   Mat                            A,B;
748076ba34aSJunchao Zhang   bool                           transA,transB;
749076ba34aSJunchao Zhang   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
750076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
751076ba34aSJunchao Zhang   MPI_Comm                       comm;
752076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
753a3f881fbSStefano Zampini 
754a3f881fbSStefano Zampini   PetscFunctionBegin;
755a3f881fbSStefano Zampini   MatCheckProduct(C,1);
756076ba34aSJunchao Zhang   ierr = PetscObjectGetComm((PetscObject)C,&comm);
757076ba34aSJunchao Zhang   if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty");
758a3f881fbSStefano Zampini   A       = product->A;
759a3f881fbSStefano Zampini   B       = product->B;
760a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
761a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
762a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
763a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
764076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
765076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
766076ba34aSJunchao Zhang 
767a3f881fbSStefano Zampini   ptype   = product->type;
768a3f881fbSStefano Zampini   switch (ptype) {
769076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
770076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
771076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
772a3f881fbSStefano Zampini     default:
773076ba34aSJunchao Zhang       SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
774a3f881fbSStefano Zampini   }
775a3f881fbSStefano Zampini 
776076ba34aSJunchao Zhang   product->data = pdata = new MatProductData_SeqAIJKokkos();
777076ba34aSJunchao Zhang   pdata->kh.set_team_work_size(16);
778076ba34aSJunchao Zhang   pdata->kh.set_dynamic_scheduling(true);
779076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
780a3f881fbSStefano Zampini 
781076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
782076ba34aSJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
7836ffb9508SJunchao Zhang   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
7846ffb9508SJunchao Zhang      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
7856ffb9508SJunchao Zhang    */
786076ba34aSJunchao Zhang   pdata->kh.create_spgemm_handle(spgemm_alg);
787076ba34aSJunchao Zhang 
788eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
789076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
790076ba34aSJunchao Zhang   if (transA) {
791076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
792076ba34aSJunchao Zhang     transA = false;
793076ba34aSJunchao Zhang   }
794076ba34aSJunchao Zhang 
795076ba34aSJunchao Zhang   if (transB) {
796076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
797076ba34aSJunchao Zhang     transB = false;
798076ba34aSJunchao Zhang   }
799076ba34aSJunchao Zhang 
800076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
801076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
802076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
803076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
804076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
805076ba34aSJunchao Zhang   */
806076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
807076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
808eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
809076ba34aSJunchao Zhang 
810076ba34aSJunchao Zhang   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
811076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
812076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
813a3f881fbSStefano Zampini   PetscFunctionReturn(0);
814a3f881fbSStefano Zampini }
815a3f881fbSStefano Zampini 
816a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
817076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
818a3f881fbSStefano Zampini {
819a3f881fbSStefano Zampini   PetscErrorCode ierr;
820076ba34aSJunchao Zhang   Mat_Product    *product = mat->product;
821a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
822a3f881fbSStefano Zampini 
823a3f881fbSStefano Zampini   PetscFunctionBegin;
824a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
825a3f881fbSStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
826a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
827a3f881fbSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
828a3f881fbSStefano Zampini   }
829a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
830a3f881fbSStefano Zampini     switch (product->type) {
831a3f881fbSStefano Zampini     case MATPRODUCT_AB:
832a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
833a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
834a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
835a3f881fbSStefano Zampini       break;
836a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
837a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
838a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
839a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
840a3f881fbSStefano Zampini       break;
841a3f881fbSStefano Zampini     default:
842a3f881fbSStefano Zampini       break;
843a3f881fbSStefano Zampini     }
844a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
845a3f881fbSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
846a3f881fbSStefano Zampini   }
847a3f881fbSStefano Zampini   PetscFunctionReturn(0);
848a3f881fbSStefano Zampini }
849a587d139SMark 
850f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
851f0cf5187SStefano Zampini {
852f0cf5187SStefano Zampini   PetscErrorCode   ierr;
853f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
854f0cf5187SStefano Zampini 
855f0cf5187SStefano Zampini   PetscFunctionBegin;
856eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
857f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
858f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
859076ba34aSJunchao Zhang   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
860076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
861076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
862eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
863f0cf5187SStefano Zampini   PetscFunctionReturn(0);
864f0cf5187SStefano Zampini }
865f0cf5187SStefano Zampini 
866a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
867a587d139SMark {
868a587d139SMark   PetscErrorCode   ierr;
869076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
870a587d139SMark 
871a587d139SMark   PetscFunctionBegin;
872076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
8732328674fSJunchao Zhang   if (aijkok) { /* Only zero the device if data is already there */
874076ba34aSJunchao Zhang     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
875076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
8762328674fSJunchao Zhang   } else { /* Might be preallocated but not assembled */
8772328674fSJunchao Zhang     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
8782328674fSJunchao Zhang   }
879a587d139SMark   PetscFunctionReturn(0);
880a587d139SMark }
881a587d139SMark 
882db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
88342550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
884db78de30SJunchao Zhang {
885db78de30SJunchao Zhang   PetscErrorCode     ierr;
886db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
887db78de30SJunchao Zhang 
888db78de30SJunchao Zhang   PetscFunctionBegin;
889db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
890db78de30SJunchao Zhang   PetscValidPointer(kv,2);
891db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
892db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
893db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
894076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
895db78de30SJunchao Zhang   PetscFunctionReturn(0);
896db78de30SJunchao Zhang }
897db78de30SJunchao Zhang 
89842550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
899db78de30SJunchao Zhang {
900db78de30SJunchao Zhang   PetscFunctionBegin;
901db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
902db78de30SJunchao Zhang   PetscValidPointer(kv,2);
903db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
904db78de30SJunchao Zhang   PetscFunctionReturn(0);
905db78de30SJunchao Zhang }
906db78de30SJunchao Zhang 
90742550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
908db78de30SJunchao Zhang {
909db78de30SJunchao Zhang   PetscErrorCode     ierr;
910db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
911db78de30SJunchao Zhang 
912db78de30SJunchao Zhang   PetscFunctionBegin;
913db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
914db78de30SJunchao Zhang   PetscValidPointer(kv,2);
915db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
916db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
917db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
918076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
919db78de30SJunchao Zhang   PetscFunctionReturn(0);
920db78de30SJunchao Zhang }
921db78de30SJunchao Zhang 
92242550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
923db78de30SJunchao Zhang {
924db78de30SJunchao Zhang   PetscErrorCode     ierr;
925db78de30SJunchao Zhang 
926db78de30SJunchao Zhang   PetscFunctionBegin;
927db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
928db78de30SJunchao Zhang   PetscValidPointer(kv,2);
929db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
930076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
931db78de30SJunchao Zhang   PetscFunctionReturn(0);
932db78de30SJunchao Zhang }
933db78de30SJunchao Zhang 
93442550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
935db78de30SJunchao Zhang {
936db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
937db78de30SJunchao Zhang 
938db78de30SJunchao Zhang   PetscFunctionBegin;
939db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
940db78de30SJunchao Zhang   PetscValidPointer(kv,2);
941db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
942db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
943076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
944db78de30SJunchao Zhang   PetscFunctionReturn(0);
945db78de30SJunchao Zhang }
946db78de30SJunchao Zhang 
94742550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
948db78de30SJunchao Zhang {
949db78de30SJunchao Zhang   PetscErrorCode     ierr;
950db78de30SJunchao Zhang 
951db78de30SJunchao Zhang   PetscFunctionBegin;
952db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
953db78de30SJunchao Zhang   PetscValidPointer(kv,2);
954db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
955076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
956db78de30SJunchao Zhang   PetscFunctionReturn(0);
957db78de30SJunchao Zhang }
958db78de30SJunchao Zhang 
959c17cf699SJunchao Zhang /* Computes Y += alpha X */
960c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
961a587d139SMark {
962a587d139SMark   PetscErrorCode             ierr;
963a587d139SMark   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
964c17cf699SJunchao Zhang   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
965c17cf699SJunchao Zhang   ConstMatScalarKokkosView   Xa;
966c17cf699SJunchao Zhang   MatScalarKokkosView        Ya;
967a587d139SMark 
968a587d139SMark   PetscFunctionBegin;
969c17cf699SJunchao Zhang   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
970c17cf699SJunchao Zhang   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
971c17cf699SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
972c17cf699SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
973eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
974db78de30SJunchao Zhang 
975c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
976c17cf699SJunchao Zhang     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
977a587d139SMark     PetscBool e;
978a587d139SMark     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
979a587d139SMark     if (e) {
980a587d139SMark       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
981c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
982a587d139SMark     }
983a587d139SMark   }
984db78de30SJunchao Zhang 
985c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
986c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
987c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
988c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
989c17cf699SJunchao Zhang   */
990c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
991c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
992c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
993c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
994c17cf699SJunchao Zhang 
995c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
996c17cf699SJunchao Zhang     KokkosBlas::axpy(alpha,Xa,Ya);
997c17cf699SJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(Y);
998c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
999c17cf699SJunchao Zhang     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
1000c17cf699SJunchao Zhang     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
1001c17cf699SJunchao Zhang 
1002c17cf699SJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
1003c17cf699SJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
1004c17cf699SJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
1005c17cf699SJunchao Zhang         PetscInt p,q = Yi(i);
1006c17cf699SJunchao Zhang         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1007c17cf699SJunchao Zhang           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1008c17cf699SJunchao Zhang           if (Xj(p) == Yj(q)) { /* Found it */
1009c17cf699SJunchao Zhang             Ya(q) += alpha * Xa(p);
1010c17cf699SJunchao Zhang             q++;
1011a587d139SMark           } else {
1012c17cf699SJunchao Zhang             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1013c17cf699SJunchao Zhang                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1014c17cf699SJunchao Zhang             */
1015c17cf699SJunchao Zhang             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1016a587d139SMark           }
1017c17cf699SJunchao Zhang         }
1018c17cf699SJunchao Zhang       });
1019c17cf699SJunchao Zhang     });
1020c17cf699SJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(Y);
1021c17cf699SJunchao Zhang   } else { /* different nonzero patterns */
1022c17cf699SJunchao Zhang     Mat             Z;
1023c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
1024c17cf699SJunchao Zhang     KernelHandle    kh;
1025c17cf699SJunchao Zhang     kh.create_spadd_handle(false);
1026c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1027c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1028c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
1029c17cf699SJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1030c17cf699SJunchao Zhang     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1031c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1032c17cf699SJunchao Zhang   }
1033eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1034eeadb341SJunchao Zhang   ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */
1035a587d139SMark   PetscFunctionReturn(0);
1036a587d139SMark }
1037a587d139SMark 
103842550becSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
103942550becSJunchao Zhang {
104042550becSJunchao Zhang   PetscErrorCode            ierr;
104142550becSJunchao Zhang   MPI_Comm                  comm;
104242550becSJunchao Zhang   PetscInt                  *i,*j,*perm,*jmap;
104342550becSJunchao Zhang   PetscInt                  k,M,N,p,q,row,start,end,nnz,nneg;
104442550becSJunchao Zhang   PetscBool                 has_repeats = PETSC_FALSE;
104542550becSJunchao Zhang   PetscInt                  *Ai,*Aj;
104642550becSJunchao Zhang   PetscScalar               *Aa;
104742550becSJunchao Zhang   Mat_SeqAIJKokkos          *akok;
104842550becSJunchao Zhang   Mat_SeqAIJ                *aseq;
104942550becSJunchao Zhang   MatRowMapKokkosViewHost   perm_h("perm",n),jmap_h;
105042550becSJunchao Zhang   Mat                       newmat;
105142550becSJunchao Zhang 
105242550becSJunchao Zhang   PetscFunctionBegin;
105342550becSJunchao Zhang   ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
105442550becSJunchao Zhang   ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr);
105542550becSJunchao Zhang   ierr = PetscMalloc2(n,&i,n,&j);CHKERRQ(ierr);
105642550becSJunchao Zhang   ierr = PetscArraycpy(i,coo_i,n);CHKERRQ(ierr); /* Make a copy since we'll modify it */
105742550becSJunchao Zhang   ierr = PetscArraycpy(j,coo_j,n);CHKERRQ(ierr);
105842550becSJunchao Zhang   perm = perm_h.data();
105942550becSJunchao Zhang   for (k=0; k<n; k++) { /* Ignore entries with negative row or col indices */
106042550becSJunchao Zhang     if (j[k] < 0) i[k] = -1;
106142550becSJunchao Zhang     perm[k] = k;
106242550becSJunchao Zhang   }
106342550becSJunchao Zhang 
106442550becSJunchao Zhang   /* Sort by row */
106542550becSJunchao Zhang   ierr = PetscSortIntWithArrayPair(n,i,j,perm);CHKERRQ(ierr);
106642550becSJunchao Zhang   for (k=0; k<n; k++) {if (i[k] >= 0) break;} /* Advance k to the first row with a non-negative index */
106742550becSJunchao Zhang   nneg = k;
106842550becSJunchao Zhang   Kokkos::resize(jmap_h,n-nneg+1); /* Allocate an extra to make a CSR-like data structure. jmap[i] is the number of repeats for i-th nonzero */
106942550becSJunchao Zhang   nnz  = 0; /* Total number of unique nonzeros to be counted */
107042550becSJunchao Zhang   jmap = jmap_h.data(); /* In the end, only the first nnz+1 elements in jmap[] are significant. */
107142550becSJunchao Zhang   jmap++; /* Inc jmap by 1 for convinience */
107242550becSJunchao Zhang 
107342550becSJunchao Zhang   ierr = PetscCalloc1(M+1,&Ai);CHKERRQ(ierr); /* CSR of A */
107442550becSJunchao Zhang   ierr = PetscMalloc1(n-nneg,&Aj);CHKERRQ(ierr); /* We have at most n-k unique nonzeros */
107542550becSJunchao Zhang 
107642550becSJunchao Zhang   /* In each row, sort by column, then unique column indices to get row length */
107742550becSJunchao Zhang   Ai++; /* Inc by 1 for convinience */
107842550becSJunchao Zhang   q = 0; /* q-th unique nonzero, with q starting from 0 */
107942550becSJunchao Zhang   while (k<n) {
108042550becSJunchao Zhang     row   = i[k];
108142550becSJunchao Zhang     start = k; /* [start,end) indices for this row */
108242550becSJunchao Zhang     while (k<n && i[k] == row) k++;
108342550becSJunchao Zhang     end   = k;
108442550becSJunchao Zhang     ierr  = PetscSortIntWithArray(end-start,j+start,perm+start);CHKERRQ(ierr);
108542550becSJunchao Zhang     /* Find number of unique col entries in this row */
108642550becSJunchao Zhang     Aj[q]   = j[start]; /* Log the first nonzero in this row */
108742550becSJunchao Zhang     jmap[q] = 1; /* Number of repeats of this nozero entry */
108842550becSJunchao Zhang     Ai[row] = 1;
108942550becSJunchao Zhang     nnz++;
109042550becSJunchao Zhang 
109142550becSJunchao Zhang     for (p=start+1; p<end; p++) { /* Scan remaining nonzero in this row */
109242550becSJunchao Zhang       if (j[p] != j[p-1]) { /* Meet a new nonzero */
109342550becSJunchao Zhang         q++;
109442550becSJunchao Zhang         jmap[q] = 1;
109542550becSJunchao Zhang         Aj[q]   = j[p];
109642550becSJunchao Zhang         Ai[row]++;
109742550becSJunchao Zhang         nnz++;
109842550becSJunchao Zhang       } else {
109942550becSJunchao Zhang         jmap[q]++;
110042550becSJunchao Zhang         has_repeats = PETSC_TRUE;
110142550becSJunchao Zhang       }
110242550becSJunchao Zhang     }
110342550becSJunchao Zhang     q++; /* Move to next row and thus next unique nonzero */
110442550becSJunchao Zhang   }
110542550becSJunchao Zhang   ierr = PetscFree2(i,j);CHKERRQ(ierr);
110642550becSJunchao Zhang 
110742550becSJunchao Zhang   Ai--; /* Back to the beginning of Ai[] */
110842550becSJunchao Zhang   for (k=0; k<M; k++) Ai[k+1] += Ai[k];
110942550becSJunchao Zhang   jmap--; /* Back to the beginning of jmap[] */
111042550becSJunchao Zhang   jmap[0] = 0;
111142550becSJunchao Zhang   if (has_repeats) { /* Only transform jmap[] to CSR when having repeats, otherwise jmap[] is not used */
111242550becSJunchao Zhang     for (k=0; k<nnz; k++) jmap[k+1] += jmap[k];
111342550becSJunchao Zhang     Kokkos::resize(jmap_h,nnz+1); /* Resize wrt the actual number of nonzeros */
111442550becSJunchao Zhang   }
111542550becSJunchao Zhang 
111642550becSJunchao Zhang   if (nnz < n-nneg) { /* Realloc Aj[] to actual number of nonzeros */
111742550becSJunchao Zhang     PetscInt *Aj_new;
111842550becSJunchao Zhang     ierr = PetscMalloc1(nnz,&Aj_new);CHKERRQ(ierr);
111942550becSJunchao Zhang     ierr = PetscArraycpy(Aj_new,Aj,nnz);CHKERRQ(ierr);
112042550becSJunchao Zhang     ierr = PetscFree(Aj);CHKERRQ(ierr);
112142550becSJunchao Zhang     Aj = Aj_new;
112242550becSJunchao Zhang   }
112342550becSJunchao Zhang 
112442550becSJunchao Zhang   ierr = PetscMalloc1(nnz,&Aa);CHKERRQ(ierr);
112542550becSJunchao Zhang   ierr = MatCreateSeqAIJWithArrays(comm,M,N,Ai,Aj,Aa,&newmat);CHKERRQ(ierr);
112642550becSJunchao Zhang   aseq = static_cast<Mat_SeqAIJ*>(newmat->data);
112742550becSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE; /* Let newmat own Ai,Aj,Aa */
112842550becSJunchao Zhang   aseq->free_a       = aseq->free_ij = PETSC_TRUE;
112942550becSJunchao Zhang 
113042550becSJunchao Zhang   ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
113142550becSJunchao Zhang   ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
113242550becSJunchao Zhang   ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
113342550becSJunchao Zhang 
113442550becSJunchao Zhang   if (nneg) { /* Discard heading entries with negative indices in perm_h, as we'll access it from index 0 in MatSetValuesCOO */
113542550becSJunchao Zhang     MatRowMapKokkosViewHost   newperm_h("perm",n-nneg);
113642550becSJunchao Zhang     Kokkos::deep_copy(newperm_h,Kokkos::subview(perm_h,Kokkos::make_pair(nneg,n)));
113742550becSJunchao Zhang     perm_h = newperm_h;
113842550becSJunchao Zhang   }
113942550becSJunchao Zhang 
114042550becSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
114142550becSJunchao Zhang   akok->SetUpCOO(n,has_repeats,jmap_h,perm_h);
114242550becSJunchao Zhang   PetscFunctionReturn(0);
114342550becSJunchao Zhang }
114442550becSJunchao Zhang 
114542550becSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
114642550becSJunchao Zhang {
114742550becSJunchao Zhang   PetscErrorCode              ierr;
114842550becSJunchao Zhang   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
114942550becSJunchao Zhang   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
115042550becSJunchao Zhang   PetscInt                    nz = aseq->nz;
115142550becSJunchao Zhang   const MatRowMapKokkosView&  jmap = akok->jmap_d;
115242550becSJunchao Zhang   const MatRowMapKokkosView&  perm = akok->perm_d;
115342550becSJunchao Zhang   MatScalarKokkosView         Aa;
115442550becSJunchao Zhang   ConstMatScalarKokkosView    kv;
115542550becSJunchao Zhang   PetscMemType                memtype;
115642550becSJunchao Zhang 
115742550becSJunchao Zhang   PetscFunctionBegin;
115842550becSJunchao Zhang   if (!v) { /* NULL v means an all zero array */
115942550becSJunchao Zhang     ierr = MatZeroEntries(A);CHKERRQ(ierr);
116042550becSJunchao Zhang     PetscFunctionReturn(0);
116142550becSJunchao Zhang   }
116242550becSJunchao Zhang 
116342550becSJunchao Zhang   ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
116442550becSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
116542550becSJunchao Zhang     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,akok->coo_n));
116642550becSJunchao Zhang   } else {
116742550becSJunchao Zhang     kv = ConstMatScalarKokkosView(v,akok->coo_n); /* Directly use v[]'s memory */
116842550becSJunchao Zhang   }
116942550becSJunchao Zhang 
117042550becSJunchao Zhang   ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr); /* Might read and write matrix values */
117142550becSJunchao Zhang   if (imode == INSERT_VALUES) {
117242550becSJunchao Zhang     Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
117342550becSJunchao Zhang   }
117442550becSJunchao Zhang 
117542550becSJunchao Zhang   if (akok->coo_has_repeats) {
117642550becSJunchao Zhang     Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) {
117742550becSJunchao Zhang       for (PetscInt k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));
117842550becSJunchao Zhang     });
117942550becSJunchao Zhang   } else {
118042550becSJunchao Zhang     Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) {Aa(i) += kv(perm(i));});
118142550becSJunchao Zhang   }
118242550becSJunchao Zhang   ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr);
118342550becSJunchao Zhang   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
118442550becSJunchao Zhang   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
118542550becSJunchao Zhang   PetscFunctionReturn(0);
118642550becSJunchao Zhang }
118742550becSJunchao Zhang 
118886a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
11898f7e8f9dSMark Adams {
11908f7e8f9dSMark Adams   PetscErrorCode ierr;
11918f7e8f9dSMark Adams 
11928f7e8f9dSMark Adams   PetscFunctionBegin;
11938f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
11948f7e8f9dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
11958f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
11968f7e8f9dSMark Adams   PetscFunctionReturn(0);
11978f7e8f9dSMark Adams }
11988f7e8f9dSMark Adams 
11998c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
12008c3ff71bSJunchao Zhang {
120142550becSJunchao Zhang   PetscErrorCode     ierr;
1202076ba34aSJunchao Zhang   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1203076ba34aSJunchao Zhang 
12048c3ff71bSJunchao Zhang   PetscFunctionBegin;
1205076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
12066f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
12076f3d89d0SStefano Zampini 
12088c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
12098c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
12108c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1211a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1212f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1213a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1214076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
12158c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
12168c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
12178c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
12188c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
12198c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
12208c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1221076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
12220ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1223152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1224076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1225076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1226076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1227076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1228076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1229076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
123042550becSJunchao Zhang 
123142550becSJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr);
123242550becSJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",       MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr);
1233076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1234076ba34aSJunchao Zhang }
1235076ba34aSJunchao Zhang 
1236076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1237076ba34aSJunchao Zhang {
1238076ba34aSJunchao Zhang   PetscErrorCode ierr;
1239076ba34aSJunchao Zhang   Mat_SeqAIJ     *aseq;
1240076ba34aSJunchao Zhang   PetscInt       i,m,n;
1241076ba34aSJunchao Zhang 
1242076ba34aSJunchao Zhang   PetscFunctionBegin;
1243076ba34aSJunchao Zhang   if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1244076ba34aSJunchao Zhang 
1245076ba34aSJunchao Zhang   m    = akok->nrows();
1246076ba34aSJunchao Zhang   n    = akok->ncols();
1247076ba34aSJunchao Zhang   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1248076ba34aSJunchao Zhang   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1249076ba34aSJunchao Zhang 
1250076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
1251076ba34aSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1252076ba34aSJunchao Zhang   aseq = (Mat_SeqAIJ*)(A)->data;
1253076ba34aSJunchao Zhang 
1254076ba34aSJunchao Zhang   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1255076ba34aSJunchao Zhang   akok->j_dual.sync_host();
1256076ba34aSJunchao Zhang 
1257076ba34aSJunchao Zhang   aseq->i            = akok->i_host_data();
1258076ba34aSJunchao Zhang   aseq->j            = akok->j_host_data();
1259076ba34aSJunchao Zhang   aseq->a            = akok->a_host_data();
1260076ba34aSJunchao Zhang   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1261076ba34aSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE;
1262076ba34aSJunchao Zhang   aseq->free_a       = PETSC_FALSE;
1263076ba34aSJunchao Zhang   aseq->free_ij      = PETSC_FALSE;
1264076ba34aSJunchao Zhang   aseq->nz           = akok->nnz();
1265076ba34aSJunchao Zhang   aseq->maxnz        = aseq->nz;
1266076ba34aSJunchao Zhang 
1267076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1268076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1269076ba34aSJunchao Zhang   for (i=0; i<m; i++) {
1270076ba34aSJunchao Zhang     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1271076ba34aSJunchao Zhang   }
1272076ba34aSJunchao Zhang 
1273076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1274076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1275076ba34aSJunchao Zhang   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1276076ba34aSJunchao Zhang   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1277076ba34aSJunchao Zhang   A->spptr = akok;
1278076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1279076ba34aSJunchao Zhang }
1280076ba34aSJunchao Zhang 
1281076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1282076ba34aSJunchao Zhang 
1283076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1284076ba34aSJunchao Zhang  */
1285076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1286076ba34aSJunchao Zhang {
1287076ba34aSJunchao Zhang   PetscErrorCode ierr;
1288076ba34aSJunchao Zhang 
1289076ba34aSJunchao Zhang   PetscFunctionBegin;
1290076ba34aSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1291076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
12928c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
12938c3ff71bSJunchao Zhang }
12948c3ff71bSJunchao Zhang 
12958c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
1296152b3e56SJunchao Zhang /*@C
12978c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
12988c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
12998c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
13008c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
13018c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
13028c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
13038c3ff71bSJunchao Zhang 
13048c3ff71bSJunchao Zhang    Collective
13058c3ff71bSJunchao Zhang 
13068c3ff71bSJunchao Zhang    Input Parameters:
13078c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
13088c3ff71bSJunchao Zhang .  m - number of rows
13098c3ff71bSJunchao Zhang .  n - number of columns
13108c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
13118c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
13128c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
13138c3ff71bSJunchao Zhang 
13148c3ff71bSJunchao Zhang    Output Parameter:
13158c3ff71bSJunchao Zhang .  A - the matrix
13168c3ff71bSJunchao Zhang 
13178c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
13188c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
13198c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
13208c3ff71bSJunchao Zhang 
13218c3ff71bSJunchao Zhang    Notes:
13228c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
13238c3ff71bSJunchao Zhang 
13248c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
13258c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
13268c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
13278c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
13288c3ff71bSJunchao Zhang 
13298c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
13308c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
13318c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
13328c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
13338c3ff71bSJunchao Zhang 
13348c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
13358c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
13368c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
13378c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
13388c3ff71bSJunchao Zhang 
13398c3ff71bSJunchao Zhang    Level: intermediate
13408c3ff71bSJunchao Zhang 
1341076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
13428c3ff71bSJunchao Zhang @*/
13438c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
13448c3ff71bSJunchao Zhang {
13458c3ff71bSJunchao Zhang   PetscErrorCode ierr;
13468c3ff71bSJunchao Zhang 
13478c3ff71bSJunchao Zhang   PetscFunctionBegin;
13488c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
13498c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
13508c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
13518c3ff71bSJunchao Zhang   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
13528c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
13538c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
13548c3ff71bSJunchao Zhang }
1355930e68a5SMark Adams 
13568f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
13578f7e8f9dSMark Adams //
135846804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used).
13598f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
13608f7e8f9dSMark Adams //
13618f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1362930e68a5SMark Adams {
13638f7e8f9dSMark Adams   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
13648f7e8f9dSMark Adams   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
13658f7e8f9dSMark Adams   IS                 isrow = b->row,isicol = b->icol;
1366930e68a5SMark Adams   PetscErrorCode     ierr;
13678f7e8f9dSMark Adams   const PetscInt     *r_h,*ic_h;
1368076ba34aSJunchao Zhang   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data();
1369076ba34aSJunchao Zhang   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1370076ba34aSJunchao Zhang   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
13718f7e8f9dSMark Adams   PetscBool          row_identity,col_identity;
137246804e07SMark Adams   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1373930e68a5SMark Adams 
1374930e68a5SMark Adams   PetscFunctionBegin;
1375c0aa6a63SJacob Faibussowitsch   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
13768f7e8f9dSMark Adams   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
13778f7e8f9dSMark Adams   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
13788f7e8f9dSMark Adams   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
13798f7e8f9dSMark Adams   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
13808f7e8f9dSMark Adams   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
13818f7e8f9dSMark Adams   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
13828f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
13838f7e8f9dSMark Adams   {
13848f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
13858f7e8f9dSMark Adams     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
13868f7e8f9dSMark Adams     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
13878f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
13888f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
13898f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
13908f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
13918f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
13928f7e8f9dSMark Adams     size_t flops_h = 0.0;
13938f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
13948f7e8f9dSMark Adams     Kokkos::View<size_t> d_flops_k ("flops");
13958f7e8f9dSMark Adams     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
13968f7e8f9dSMark Adams     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
13978f7e8f9dSMark Adams     Kokkos::deep_copy (d_flops_k, h_flops_k);
13988f7e8f9dSMark Adams     Kokkos::deep_copy (d_r_k, h_r_k);
13998f7e8f9dSMark Adams     Kokkos::deep_copy (d_ic_k, h_ic_k);
14008f7e8f9dSMark Adams     // Fill A --> fact
14018f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1402042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
14038f7e8f9dSMark Adams         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
14048f7e8f9dSMark Adams         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
14058f7e8f9dSMark Adams         // zero rows of B
14068f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
14078f7e8f9dSMark Adams             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
14088f7e8f9dSMark Adams             PetscScalar *baL = ba_d + bi_d[rowb];
14098f7e8f9dSMark Adams             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
14108f7e8f9dSMark Adams             /* zero (unfactored row) */
14118f7e8f9dSMark Adams             for (int j=0;j<nzbL;j++) baL[j] = 0;
14128f7e8f9dSMark Adams             for (int j=0;j<nzbU;j++) baU[j] = 0;
14138f7e8f9dSMark Adams           });
14148f7e8f9dSMark Adams         // copy A into B
14158f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
14168f7e8f9dSMark Adams             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
14178f7e8f9dSMark Adams             const PetscScalar *av    = aa_d + ai_d[rowa];
14188f7e8f9dSMark Adams             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
14198f7e8f9dSMark Adams             /* load in initial (unfactored row) */
14208f7e8f9dSMark Adams             for (int j=0;j<nza;j++) {
14218f7e8f9dSMark Adams               PetscInt    colb = ic[ajtmp[j]];
14228f7e8f9dSMark Adams               PetscScalar vala = av[j];
14238f7e8f9dSMark Adams               if (colb == rowb) {
14248f7e8f9dSMark Adams                 *(ba_d + bdiag_d[rowb]) = vala;
14258f7e8f9dSMark Adams               } else {
14268f7e8f9dSMark Adams                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
14278f7e8f9dSMark Adams                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
14288f7e8f9dSMark Adams                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
14298f7e8f9dSMark Adams                 for (int j=0; j<nz ; j++) {
14308f7e8f9dSMark Adams                   if (pbj[j] == colb) {
14318f7e8f9dSMark Adams                     pba[j] = vala;
14328f7e8f9dSMark Adams                     set++;
14338f7e8f9dSMark Adams                     break;
14348f7e8f9dSMark Adams                   }
14358f7e8f9dSMark Adams                 }
1436*8f1da0b2SJunchao Zhang                #if !defined(PETSC_HAVE_SYCL)
14378f7e8f9dSMark Adams                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
1438*8f1da0b2SJunchao Zhang                #endif
14398f7e8f9dSMark Adams               }
14408f7e8f9dSMark Adams             }
14418f7e8f9dSMark Adams           });
14428f7e8f9dSMark Adams       });
14438f7e8f9dSMark Adams     Kokkos::fence();
1444930e68a5SMark Adams 
14458f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
14468f7e8f9dSMark Adams         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
14478f7e8f9dSMark Adams         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
14488f7e8f9dSMark Adams         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1449042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
14508f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc;
14518f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
14528f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
14538f7e8f9dSMark Adams         for (int ii=start; ii<end-1; ii++) {
14548f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
14558f7e8f9dSMark Adams           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
14568f7e8f9dSMark Adams           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
14578f7e8f9dSMark Adams           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
14588f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
14598f7e8f9dSMark Adams               PetscInt kIdx = j*Ni + field_block_idx;
14608f7e8f9dSMark Adams               if (kIdx >= nzUi) /* void */ ;
14618f7e8f9dSMark Adams               else {
14628f7e8f9dSMark Adams                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
14638f7e8f9dSMark Adams                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
14648f7e8f9dSMark Adams                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
14658f7e8f9dSMark Adams                 size_t         st_idx;
14668f7e8f9dSMark Adams                 // find and do L(k,i) = A(:k,i) / A(i,i)
14678f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
14688f7e8f9dSMark Adams                 // get column, there has got to be a better way
14698f7e8f9dSMark Adams                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
14708f7e8f9dSMark Adams                     if (pjL[j] == ii) {
14718f7e8f9dSMark Adams                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
14728f7e8f9dSMark Adams                       idx = j; // output
14738f7e8f9dSMark Adams                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
14748f7e8f9dSMark Adams                     }
14758f7e8f9dSMark Adams                 }, st_idx);
14768f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
1477*8f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
147899551766SMark Adams                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
147999551766SMark Adams #endif
148099551766SMark Adams                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
14818f7e8f9dSMark Adams                 // U(i+1,:end)
14828f7e8f9dSMark Adams                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
14838f7e8f9dSMark Adams                       PetscScalar Uij = baUi[uiIdx];
14848f7e8f9dSMark Adams                       PetscInt    col = bjUi[uiIdx];
14858f7e8f9dSMark Adams                       if (col==myk) {
14868f7e8f9dSMark Adams                         // A_kk = A_kk - L_ki * U_ij(k)
14878f7e8f9dSMark Adams                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
14888f7e8f9dSMark Adams                         *Akkv = *Akkv - L_ki() * Uij; // UiK
14898f7e8f9dSMark Adams                       } else {
14908f7e8f9dSMark Adams                         PetscScalar    *start, *end, *pAkjv=NULL;
14918f7e8f9dSMark Adams                         PetscInt       high, low;
14928f7e8f9dSMark Adams                         const PetscInt *startj;
14938f7e8f9dSMark Adams                         if (col<myk) { // L
14948f7e8f9dSMark Adams                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
14958f7e8f9dSMark Adams                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
14968f7e8f9dSMark Adams                           start = pLki+1; // start at pLki+1, A22(myk,1)
14978f7e8f9dSMark Adams                           startj= bj_d + bi_d[myk] + idx;
14988f7e8f9dSMark Adams                           end   = ba_d + bi_d[myk+1];
14998f7e8f9dSMark Adams                         } else {
15008f7e8f9dSMark Adams                           PetscInt idx = bdiag_d[myk+1]+1;
15018f7e8f9dSMark Adams                           start = ba_d + idx;
15028f7e8f9dSMark Adams                           startj= bj_d + idx;
15038f7e8f9dSMark Adams                           end   = ba_d + bdiag_d[myk];
15048f7e8f9dSMark Adams                         }
15058f7e8f9dSMark Adams                         // search for 'col', use bisection search - TODO
15068f7e8f9dSMark Adams                         low  = 0;
15078f7e8f9dSMark Adams                         high = (PetscInt)(end-start);
15088f7e8f9dSMark Adams                         while (high-low > 5) {
15098f7e8f9dSMark Adams                           int t = (low+high)/2;
15108f7e8f9dSMark Adams                           if (startj[t] > col) high = t;
15118f7e8f9dSMark Adams                           else                 low  = t;
15128f7e8f9dSMark Adams                         }
15138f7e8f9dSMark Adams                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
15148f7e8f9dSMark Adams                           if (startj[pAkjv-start] == col) break;
15158f7e8f9dSMark Adams                         }
1516*8f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
151799551766SMark Adams                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
151899551766SMark Adams #endif
15198f7e8f9dSMark Adams                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
15208f7e8f9dSMark Adams                       }
15218f7e8f9dSMark Adams                     });
15228f7e8f9dSMark Adams               }
15238f7e8f9dSMark Adams             });
15248f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
15258f7e8f9dSMark Adams           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
15268f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
15278f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
15288f7e8f9dSMark Adams       });
15298f7e8f9dSMark Adams     Kokkos::fence();
15308f7e8f9dSMark Adams     Kokkos::deep_copy (h_flops_k, d_flops_k);
15318f7e8f9dSMark Adams     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
15328f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
15338f7e8f9dSMark Adams         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
15348f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
15358f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
15368f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
15378f7e8f9dSMark Adams             int i = start + outer_index*Ni + lg_rank%Ni;
15388f7e8f9dSMark Adams             if (i < end) {
15398f7e8f9dSMark Adams               PetscScalar *pv = ba_d + bdiag_d[i];
15408f7e8f9dSMark Adams               *pv = 1.0/(*pv);
15418f7e8f9dSMark Adams             }
15428f7e8f9dSMark Adams           });
15438f7e8f9dSMark Adams       });
15448f7e8f9dSMark Adams   }
15458f7e8f9dSMark Adams   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
15468f7e8f9dSMark Adams   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
15478f7e8f9dSMark Adams   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
15488f7e8f9dSMark Adams 
15498f7e8f9dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
15508f7e8f9dSMark Adams   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
15518f7e8f9dSMark Adams   if (b->inode.size) {
15528f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
15538f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
15548f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
15558f7e8f9dSMark Adams   } else {
15568f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
15578f7e8f9dSMark Adams   }
15588f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
15598f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
15608f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
15618f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
15628f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
15638f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
15648f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
15658f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
15668f7e8f9dSMark Adams 
1567930e68a5SMark Adams   PetscFunctionReturn(0);
1568930e68a5SMark Adams }
1569930e68a5SMark Adams 
157086a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1571930e68a5SMark Adams {
1572930e68a5SMark Adams   PetscErrorCode   ierr;
1573930e68a5SMark Adams 
1574930e68a5SMark Adams   PetscFunctionBegin;
1575930e68a5SMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
157686a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
157786a27549SJunchao Zhang   PetscFunctionReturn(0);
157886a27549SJunchao Zhang }
157986a27549SJunchao Zhang 
158086a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
158186a27549SJunchao Zhang {
158286a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
158386a27549SJunchao Zhang 
158486a27549SJunchao Zhang   PetscFunctionBegin;
158586a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
158686a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
158786a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
158886a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
158986a27549SJunchao Zhang   }
159086a27549SJunchao Zhang   PetscFunctionReturn(0);
159186a27549SJunchao Zhang }
159286a27549SJunchao Zhang 
159386a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
159486a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
159586a27549SJunchao Zhang {
159686a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1597076ba34aSJunchao Zhang   MatColIdxType             n = A->rmap->n;
159886a27549SJunchao Zhang 
159986a27549SJunchao Zhang   PetscFunctionBegin;
160086a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
160186a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
1602076ba34aSJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
160386a27549SJunchao Zhang     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1604076ba34aSJunchao Zhang     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1605076ba34aSJunchao Zhang     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
160686a27549SJunchao Zhang 
160786a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1608076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1609076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1610076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
161186a27549SJunchao Zhang         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
161286a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
161386a27549SJunchao Zhang 
161486a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
161586a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
161686a27549SJunchao Zhang     */
1617076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1618076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
161986a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
162086a27549SJunchao Zhang 
162186a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
162286a27549SJunchao Zhang 
162386a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
1624076ba34aSJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
162586a27549SJunchao Zhang     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1626076ba34aSJunchao Zhang     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1627076ba34aSJunchao Zhang     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
162886a27549SJunchao Zhang 
162986a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1630076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1631076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1632076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
163386a27549SJunchao Zhang         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
163486a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
163586a27549SJunchao Zhang 
163686a27549SJunchao Zhang     /* Sort indices. See comments above */
1637076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1638076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
163986a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
164086a27549SJunchao Zhang 
164186a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
164286a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
164386a27549SJunchao Zhang   }
164486a27549SJunchao Zhang   PetscFunctionReturn(0);
164586a27549SJunchao Zhang }
164686a27549SJunchao Zhang 
164786a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
164886a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
164986a27549SJunchao Zhang {
165086a27549SJunchao Zhang   PetscErrorCode                 ierr;
165186a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
165286a27549SJunchao Zhang   PetscScalarKokkosView          xv;
165386a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
165486a27549SJunchao Zhang 
165586a27549SJunchao Zhang   PetscFunctionBegin;
1656eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
165786a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
165886a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
165986a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
166086a27549SJunchao Zhang   /* Solve L tmpv = b */
16613f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
166286a27549SJunchao Zhang   /* Solve Ux = tmpv */
16633f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
166486a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
166586a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1666eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
166786a27549SJunchao Zhang   PetscFunctionReturn(0);
166886a27549SJunchao Zhang }
166986a27549SJunchao Zhang 
1670076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
167186a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
167286a27549SJunchao Zhang {
167386a27549SJunchao Zhang   PetscErrorCode                 ierr;
167486a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
167586a27549SJunchao Zhang   PetscScalarKokkosView          xv;
167686a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
167786a27549SJunchao Zhang 
167886a27549SJunchao Zhang   PetscFunctionBegin;
1679eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
168086a27549SJunchao Zhang   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
168186a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
168286a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
168386a27549SJunchao Zhang   /* Solve U^T tmpv = b */
168486a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
168586a27549SJunchao Zhang 
168686a27549SJunchao Zhang   /* Solve L^T x = tmpv */
168786a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
168886a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
168986a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1690eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
169186a27549SJunchao Zhang   PetscFunctionReturn(0);
169286a27549SJunchao Zhang }
169386a27549SJunchao Zhang 
169486a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
169586a27549SJunchao Zhang {
169686a27549SJunchao Zhang   PetscErrorCode                 ierr;
169786a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
169886a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
169986a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
170086a27549SJunchao Zhang 
170186a27549SJunchao Zhang   PetscFunctionBegin;
1702eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
170386a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1704076ba34aSJunchao Zhang 
1705076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1706076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1707076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1708076ba34aSJunchao Zhang 
1709076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
171086a27549SJunchao Zhang 
171186a27549SJunchao Zhang   B->assembled                       = PETSC_TRUE;
171286a27549SJunchao Zhang   B->preallocated                    = PETSC_TRUE;
171386a27549SJunchao Zhang   B->ops->solve                      = MatSolve_SeqAIJKokkos;
171486a27549SJunchao Zhang   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
171586a27549SJunchao Zhang   B->ops->matsolve                   = NULL;
171686a27549SJunchao Zhang   B->ops->matsolvetranspose          = NULL;
171786a27549SJunchao Zhang   B->offloadmask                     = PETSC_OFFLOAD_GPU;
171886a27549SJunchao Zhang 
171986a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
172086a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
172186a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1722eeadb341SJunchao Zhang   /* TODO: log flops, but how to know that? */
1723eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
172486a27549SJunchao Zhang   PetscFunctionReturn(0);
172586a27549SJunchao Zhang }
172686a27549SJunchao Zhang 
172786a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
172886a27549SJunchao Zhang {
172986a27549SJunchao Zhang   PetscErrorCode                 ierr;
173086a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok;
173186a27549SJunchao Zhang   Mat_SeqAIJ                     *b;
173286a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
173386a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
173486a27549SJunchao Zhang   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
173586a27549SJunchao Zhang   PetscInt                       n = A->rmap->n;
173686a27549SJunchao Zhang 
173786a27549SJunchao Zhang   PetscFunctionBegin;
173886a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
173986a27549SJunchao Zhang   /* Rebuild factors */
174086a27549SJunchao Zhang   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
174186a27549SJunchao Zhang   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
174286a27549SJunchao Zhang 
174386a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
174486a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
174586a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
174686a27549SJunchao Zhang 
174786a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
174886a27549SJunchao Zhang 
174986a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
175086a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
175186a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d,n+1);
175286a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
175386a27549SJunchao Zhang 
175486a27549SJunchao Zhang   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1755076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1756076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1757076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
175886a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
175986a27549SJunchao Zhang 
176086a27549SJunchao Zhang   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
176186a27549SJunchao Zhang   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
176286a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
176386a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
176486a27549SJunchao Zhang 
176586a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
176686a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
176786a27549SJunchao Zhang  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
176886a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
176986a27549SJunchao Zhang  #else
177086a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
177186a27549SJunchao Zhang  #endif
177286a27549SJunchao Zhang 
177386a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
177486a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
177586a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
177686a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
177786a27549SJunchao Zhang 
177886a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
177986a27549SJunchao Zhang   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
178086a27549SJunchao Zhang   b     = (Mat_SeqAIJ*)B->data;
178186a27549SJunchao Zhang   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
178286a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
178386a27549SJunchao Zhang   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
178486a27549SJunchao Zhang 
178586a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
178686a27549SJunchao Zhang   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1787930e68a5SMark Adams   PetscFunctionReturn(0);
1788930e68a5SMark Adams }
1789930e68a5SMark Adams 
17908f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
17918f7e8f9dSMark Adams {
17928f7e8f9dSMark Adams   PetscErrorCode   ierr;
17938f7e8f9dSMark Adams   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
17948f7e8f9dSMark Adams   const PetscInt   nrows   = A->rmap->n;
1795930e68a5SMark Adams 
17968f7e8f9dSMark Adams   PetscFunctionBegin;
17978f7e8f9dSMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
17988f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
17998f7e8f9dSMark Adams   // move B data into Kokkos
18008f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
18018f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
18028f7e8f9dSMark Adams   {
18038f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
18048f7e8f9dSMark Adams     if (!baijkok->diag_d) {
18058f7e8f9dSMark Adams       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1806152b3e56SJunchao Zhang       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
18078f7e8f9dSMark Adams       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
18088f7e8f9dSMark Adams     }
18098f7e8f9dSMark Adams   }
18108f7e8f9dSMark Adams   PetscFunctionReturn(0);
18118f7e8f9dSMark Adams }
18128f7e8f9dSMark Adams 
181386a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1814930e68a5SMark Adams {
1815930e68a5SMark Adams   PetscFunctionBegin;
1816930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
1817930e68a5SMark Adams   PetscFunctionReturn(0);
1818930e68a5SMark Adams }
1819930e68a5SMark Adams 
18208f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
18218f7e8f9dSMark Adams {
18228f7e8f9dSMark Adams   PetscFunctionBegin;
18238f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
18248f7e8f9dSMark Adams   PetscFunctionReturn(0);
18258f7e8f9dSMark Adams }
18268f7e8f9dSMark Adams 
1827930e68a5SMark Adams /*MC
182886a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
182986a27549SJunchao Zhang   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1830930e68a5SMark Adams 
1831930e68a5SMark Adams   Level: beginner
1832930e68a5SMark Adams 
183386a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1834930e68a5SMark Adams M*/
183586a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1836930e68a5SMark Adams {
1837930e68a5SMark Adams   PetscErrorCode ierr;
1838930e68a5SMark Adams   PetscInt       n = A->rmap->n;
1839930e68a5SMark Adams 
1840930e68a5SMark Adams   PetscFunctionBegin;
1841930e68a5SMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1842930e68a5SMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1843930e68a5SMark Adams   (*B)->factortype = ftype;
18444ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1845930e68a5SMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1846930e68a5SMark Adams 
18478f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
1848930e68a5SMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
184986a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_TRUE;
185086a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
185186a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
185286a27549SJunchao Zhang     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
185386a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
185486a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
185586a27549SJunchao Zhang   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1856930e68a5SMark Adams 
1857930e68a5SMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
185886a27549SJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1859930e68a5SMark Adams   PetscFunctionReturn(0);
1860930e68a5SMark Adams }
18618f7e8f9dSMark Adams 
18628f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
18638f7e8f9dSMark Adams {
18648f7e8f9dSMark Adams   PetscErrorCode ierr;
18658f7e8f9dSMark Adams   PetscInt       n = A->rmap->n;
18668f7e8f9dSMark Adams 
18678f7e8f9dSMark Adams   PetscFunctionBegin;
18688f7e8f9dSMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
18698f7e8f9dSMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
18708f7e8f9dSMark Adams   (*B)->factortype = ftype;
1871f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
18724ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
18738f7e8f9dSMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
18748f7e8f9dSMark Adams 
18758f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
18768f7e8f9dSMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
18778f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
18788f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
18798f7e8f9dSMark Adams 
18808f7e8f9dSMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
18818f7e8f9dSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
18828f7e8f9dSMark Adams   PetscFunctionReturn(0);
18838f7e8f9dSMark Adams }
188486a27549SJunchao Zhang 
188586a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
188686a27549SJunchao Zhang {
188786a27549SJunchao Zhang   PetscErrorCode ierr;
188886a27549SJunchao Zhang 
188986a27549SJunchao Zhang   PetscFunctionBegin;
189086a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
189186a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
189286a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
189386a27549SJunchao Zhang   PetscFunctionReturn(0);
189486a27549SJunchao Zhang }
189586a27549SJunchao Zhang 
1896076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
1897076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1898076ba34aSJunchao Zhang {
1899076ba34aSJunchao Zhang   PetscErrorCode    ierr;
1900076ba34aSJunchao Zhang   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1901076ba34aSJunchao Zhang   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1902076ba34aSJunchao Zhang   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1903076ba34aSJunchao Zhang   const PetscInt    *i = iv.data();
1904076ba34aSJunchao Zhang   const PetscInt    *j = jv.data();
1905076ba34aSJunchao Zhang   const PetscScalar *a = av.data();
1906076ba34aSJunchao Zhang   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1907076ba34aSJunchao Zhang 
1908076ba34aSJunchao Zhang   PetscFunctionBegin;
1909c0aa6a63SJacob Faibussowitsch   ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1910076ba34aSJunchao Zhang   for (PetscInt k=0; k<m; k++) {
1911c0aa6a63SJacob Faibussowitsch     ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr);
1912076ba34aSJunchao Zhang     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1913c0aa6a63SJacob Faibussowitsch       ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1914076ba34aSJunchao Zhang     }
1915076ba34aSJunchao Zhang     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1916076ba34aSJunchao Zhang   }
1917076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1918076ba34aSJunchao Zhang }
1919