xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision eeadb3416b8208c7cfb3934b3e389a151ad70de5)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2076ba34aSJunchao Zhang #include <petscpkg_version.h>
3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
48c3ff71bSJunchao Zhang #include <petscsystypes.h>
58c3ff71bSJunchao Zhang #include <petscerror.h>
68c3ff71bSJunchao Zhang 
78c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
8f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
9076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp>
108c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
118c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
14076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
15076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
1686a27549SJunchao Zhang 
178c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h>
188c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
198c3ff71bSJunchao Zhang 
208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
218c3ff71bSJunchao Zhang 
22076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
25076ba34aSJunchao Zhang  */
268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
278c3ff71bSJunchao Zhang {
288c3ff71bSJunchao Zhang   PetscErrorCode    ierr;
29076ba34aSJunchao Zhang   Mat_SeqAIJ        *aijseq;
30076ba34aSJunchao Zhang   Mat_SeqAIJKokkos  *aijkok;
318c3ff71bSJunchao Zhang 
328c3ff71bSJunchao Zhang   PetscFunctionBegin;
33076ba34aSJunchao Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
348c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
35076ba34aSJunchao Zhang 
36076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38076ba34aSJunchao Zhang 
39076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
40076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
41076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
42076ba34aSJunchao Zhang   */
43076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
44076ba34aSJunchao Zhang     delete aijkok;
45076ba34aSJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
46076ba34aSJunchao Zhang     A->spptr = aijkok;
47076ba34aSJunchao Zhang   }
48076ba34aSJunchao Zhang 
49a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
50a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
51a587d139SMark   }
528c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
538c3ff71bSJunchao Zhang }
548c3ff71bSJunchao Zhang 
5586a27549SJunchao Zhang /* Sync CSR data to device if not yet */
56076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
578c3ff71bSJunchao Zhang {
588c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
598c3ff71bSJunchao Zhang 
608c3ff71bSJunchao Zhang   PetscFunctionBegin;
6186a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
62076ba34aSJunchao Zhang   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
63076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
64076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
65076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
66076ba34aSJunchao Zhang     aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */
6786a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
688c3ff71bSJunchao Zhang   }
698c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
708c3ff71bSJunchao Zhang }
718c3ff71bSJunchao Zhang 
72076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
73fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
7486a27549SJunchao Zhang {
7586a27549SJunchao Zhang   PetscErrorCode   ierr;
7686a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
7786a27549SJunchao Zhang 
7886a27549SJunchao Zhang   PetscFunctionBegin;
7986a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
8086a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
8186a27549SJunchao Zhang   aijkok->a_dual.modify_device();
8286a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
8386a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
842328674fSJunchao Zhang   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
8586a27549SJunchao Zhang   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
8686a27549SJunchao Zhang   PetscFunctionReturn(0);
8786a27549SJunchao Zhang }
8886a27549SJunchao Zhang 
89f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
90f0cf5187SStefano Zampini {
91f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
92f0cf5187SStefano Zampini 
93f0cf5187SStefano Zampini   PetscFunctionBegin;
94f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9586a27549SJunchao Zhang    /* We do not expect one needs factors on host  */
9686a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
97f0cf5187SStefano Zampini   if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
98076ba34aSJunchao Zhang   aijkok->a_dual.sync_host();
99f0cf5187SStefano Zampini   PetscFunctionReturn(0);
100f0cf5187SStefano Zampini }
101f0cf5187SStefano Zampini 
102f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
103f0cf5187SStefano Zampini {
104076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
105f0cf5187SStefano Zampini 
106f0cf5187SStefano Zampini   PetscFunctionBegin;
107076ba34aSJunchao Zhang   if (aijkok) {
108076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
109076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
110076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
111076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
112076ba34aSJunchao Zhang   }
113076ba34aSJunchao Zhang   PetscFunctionReturn(0);
114076ba34aSJunchao Zhang }
115076ba34aSJunchao Zhang 
116076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
117076ba34aSJunchao Zhang {
118076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
119076ba34aSJunchao Zhang 
120076ba34aSJunchao Zhang   PetscFunctionBegin;
121076ba34aSJunchao Zhang   if (aijkok) aijkok->a_dual.modify_host();
122076ba34aSJunchao Zhang   PetscFunctionReturn(0);
123076ba34aSJunchao Zhang }
124076ba34aSJunchao Zhang 
125076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
126076ba34aSJunchao Zhang {
127076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
128076ba34aSJunchao Zhang 
129076ba34aSJunchao Zhang   PetscFunctionBegin;
1302328674fSJunchao Zhang   if (aijkok) {
131076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
132076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1332328674fSJunchao Zhang   } else {
1342328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1352328674fSJunchao Zhang   }
136076ba34aSJunchao Zhang   PetscFunctionReturn(0);
137076ba34aSJunchao Zhang }
138076ba34aSJunchao Zhang 
139076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
140076ba34aSJunchao Zhang {
141076ba34aSJunchao Zhang   PetscFunctionBegin;
142076ba34aSJunchao Zhang   *array = NULL;
143076ba34aSJunchao Zhang   PetscFunctionReturn(0);
144076ba34aSJunchao Zhang }
145076ba34aSJunchao Zhang 
146076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
147076ba34aSJunchao Zhang {
148076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
149076ba34aSJunchao Zhang 
150076ba34aSJunchao Zhang   PetscFunctionBegin;
1512328674fSJunchao Zhang   if (aijkok) {
152076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1532328674fSJunchao Zhang   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
1542328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1552328674fSJunchao Zhang   }
156076ba34aSJunchao Zhang   PetscFunctionReturn(0);
157076ba34aSJunchao Zhang }
158076ba34aSJunchao Zhang 
159076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
160076ba34aSJunchao Zhang {
161076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
162076ba34aSJunchao Zhang 
163076ba34aSJunchao Zhang   PetscFunctionBegin;
1642328674fSJunchao Zhang   if (aijkok) {
165076ba34aSJunchao Zhang     aijkok->a_dual.clear_sync_state();
166076ba34aSJunchao Zhang     aijkok->a_dual.modify_host();
1672328674fSJunchao Zhang   }
168f0cf5187SStefano Zampini   PetscFunctionReturn(0);
169f0cf5187SStefano Zampini }
170f0cf5187SStefano Zampini 
171a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
172042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
173a587d139SMark {
174042217e8SBarry Smith   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
175042217e8SBarry Smith   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
176a587d139SMark 
177a587d139SMark   PetscFunctionBegin;
178076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
179152b3e56SJunchao Zhang   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
180a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
181a587d139SMark   PetscFunctionReturn(0);
182a587d139SMark }
183a587d139SMark 
184a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
185042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
186a587d139SMark {
187042217e8SBarry Smith   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
188a587d139SMark 
189a587d139SMark   PetscFunctionBegin;
190a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
191a587d139SMark     *d_mat = aijkok->device_mat_d.data();
192a587d139SMark   } else {
193a587d139SMark     PetscErrorCode   ierr;
194a587d139SMark     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
195a587d139SMark     *d_mat  = NULL;
196a587d139SMark   }
197a587d139SMark   PetscFunctionReturn(0);
198a587d139SMark }
199076ba34aSJunchao Zhang 
200076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */
201076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
202152b3e56SJunchao Zhang {
203152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
204152b3e56SJunchao Zhang 
205152b3e56SJunchao Zhang   PetscFunctionBegin;
206076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
207076ba34aSJunchao Zhang   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
208076ba34aSJunchao Zhang     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
209076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
210076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
211076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
21286a27549SJunchao Zhang     aijkok->transpose_updated = PETSC_TRUE;
213076ba34aSJunchao Zhang   }
214076ba34aSJunchao Zhang   *csrmatT = &aijkok->csrmatT;
215152b3e56SJunchao Zhang   PetscFunctionReturn(0);
216152b3e56SJunchao Zhang }
217152b3e56SJunchao Zhang 
218076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */
219076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
220152b3e56SJunchao Zhang {
221*eeadb341SJunchao Zhang   PetscErrorCode                   ierr;
222152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
223152b3e56SJunchao Zhang 
224152b3e56SJunchao Zhang   PetscFunctionBegin;
225*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
226076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
227076ba34aSJunchao Zhang   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
228076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
229076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
230076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
231076ba34aSJunchao Zhang    #if defined(PETSC_USE_COMPLEX)
232076ba34aSJunchao Zhang     const auto& a = aijkok->csrmatH.values;
233076ba34aSJunchao Zhang     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
234076ba34aSJunchao Zhang    #endif
23586a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_TRUE;
236076ba34aSJunchao Zhang   }
237076ba34aSJunchao Zhang   *csrmatH = &aijkok->csrmatH;
238*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
239152b3e56SJunchao Zhang   PetscFunctionReturn(0);
240152b3e56SJunchao Zhang }
241a587d139SMark 
2428c3ff71bSJunchao Zhang /* y = A x */
2438c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2448c3ff71bSJunchao Zhang {
2458c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2468c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
247152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
248152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2498c3ff71bSJunchao Zhang 
2508c3ff71bSJunchao Zhang   PetscFunctionBegin;
2518c3ff71bSJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
252152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
253152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
2548c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
255*eeadb341SJunchao Zhang   ierr   = PetscLogGpuTimeBegin();CHKERRQ(ierr);
256152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
257152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
258152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
259076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
260152b3e56SJunchao Zhang   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2618c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2628c3ff71bSJunchao Zhang }
2638c3ff71bSJunchao Zhang 
2648c3ff71bSJunchao Zhang /* y = A^T x */
2658c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2668c3ff71bSJunchao Zhang {
2678c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2688c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
269152b3e56SJunchao Zhang   const char                       *mode;
270152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
271152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
272076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
2738c3ff71bSJunchao Zhang 
2748c3ff71bSJunchao Zhang   PetscFunctionBegin;
275*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2768c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
277152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
278152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
279152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
280076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
281152b3e56SJunchao Zhang     mode = "N";
282152b3e56SJunchao Zhang   } else {
283076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
284076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
285152b3e56SJunchao Zhang     mode = "T";
286152b3e56SJunchao Zhang   }
287076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
288152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
289152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
290076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
291*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2928c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2938c3ff71bSJunchao Zhang }
2948c3ff71bSJunchao Zhang 
2958c3ff71bSJunchao Zhang /* y = A^H x */
2968c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2978c3ff71bSJunchao Zhang {
2988c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2998c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
300152b3e56SJunchao Zhang   const char                       *mode;
301152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
302152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
303076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3048c3ff71bSJunchao Zhang 
3058c3ff71bSJunchao Zhang   PetscFunctionBegin;
306*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3078c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
308152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
309152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
310152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
311076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
312152b3e56SJunchao Zhang     mode = "N";
313152b3e56SJunchao Zhang   } else {
314076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
315076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
316152b3e56SJunchao Zhang     mode = "C";
317152b3e56SJunchao Zhang   }
318076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
319152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
320152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
321076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
322*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3238c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3248c3ff71bSJunchao Zhang }
3258c3ff71bSJunchao Zhang 
3268c3ff71bSJunchao Zhang /* z = A x + y */
3278c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
3288c3ff71bSJunchao Zhang {
3298c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3308c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
331152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
332152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
3338c3ff71bSJunchao Zhang 
3348c3ff71bSJunchao Zhang   PetscFunctionBegin;
335*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3368c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
337152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
338152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
339152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3408c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
3418c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
342152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
343152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
344152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
345152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
346152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
347*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3488c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3498c3ff71bSJunchao Zhang }
3508c3ff71bSJunchao Zhang 
3518c3ff71bSJunchao Zhang /* z = A^T x + y */
3528c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3538c3ff71bSJunchao Zhang {
3548c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3558c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
356152b3e56SJunchao Zhang   const char                       *mode;
357152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
358152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
359076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3608c3ff71bSJunchao Zhang 
3618c3ff71bSJunchao Zhang   PetscFunctionBegin;
362*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3638c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
364152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
365152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
366152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3678c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
368152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
369076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
370152b3e56SJunchao Zhang     mode = "N";
371152b3e56SJunchao Zhang   } else {
372076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
373076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
374152b3e56SJunchao Zhang     mode = "T";
375152b3e56SJunchao Zhang   }
376076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
377152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
378152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
379152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
380076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
381*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3828c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3838c3ff71bSJunchao Zhang }
3848c3ff71bSJunchao Zhang 
3858c3ff71bSJunchao Zhang /* z = A^H x + y */
3868c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3878c3ff71bSJunchao Zhang {
3888c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3898c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
390152b3e56SJunchao Zhang   const char                       *mode;
391152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
392152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
393076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3948c3ff71bSJunchao Zhang 
3958c3ff71bSJunchao Zhang   PetscFunctionBegin;
396*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3978c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
398152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
399152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
400152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
4018c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
402152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
403076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
404152b3e56SJunchao Zhang     mode = "N";
405152b3e56SJunchao Zhang   } else {
406076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
407076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
408152b3e56SJunchao Zhang     mode = "C";
409152b3e56SJunchao Zhang   }
410076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
411152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
412152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
413152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
414076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
415*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
416152b3e56SJunchao Zhang   PetscFunctionReturn(0);
417152b3e56SJunchao Zhang }
418152b3e56SJunchao Zhang 
419152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
420152b3e56SJunchao Zhang {
421152b3e56SJunchao Zhang   PetscErrorCode            ierr;
422152b3e56SJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
423152b3e56SJunchao Zhang 
424152b3e56SJunchao Zhang   PetscFunctionBegin;
425152b3e56SJunchao Zhang   switch (op) {
426152b3e56SJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
427152b3e56SJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
428152b3e56SJunchao Zhang       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
429152b3e56SJunchao Zhang       A->form_explicit_transpose = flg;
430152b3e56SJunchao Zhang       break;
431152b3e56SJunchao Zhang     default:
432152b3e56SJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
433152b3e56SJunchao Zhang       break;
434152b3e56SJunchao Zhang   }
4358c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4368c3ff71bSJunchao Zhang }
4378c3ff71bSJunchao Zhang 
438076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
4393d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
4408c3ff71bSJunchao Zhang {
4418c3ff71bSJunchao Zhang   PetscErrorCode   ierr;
442076ba34aSJunchao Zhang   Mat_SeqAIJ       *aseq;
4438c3ff71bSJunchao Zhang 
4448c3ff71bSJunchao Zhang   PetscFunctionBegin;
445a3f881fbSStefano Zampini   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
446076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
447076ba34aSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
4488c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
449076ba34aSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
450076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
451076ba34aSJunchao Zhang     if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
452076ba34aSJunchao Zhang     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
453076ba34aSJunchao Zhang     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
454076ba34aSJunchao Zhang     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
455076ba34aSJunchao Zhang     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
456076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ*>(A->data);
457076ba34aSJunchao Zhang     if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */
458076ba34aSJunchao Zhang       if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
459076ba34aSJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
4608c3ff71bSJunchao Zhang     }
461076ba34aSJunchao Zhang   }
4628c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4638c3ff71bSJunchao Zhang }
4648c3ff71bSJunchao Zhang 
465076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
466076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
467076ba34aSJunchao Zhang  */
468076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
4698c3ff71bSJunchao Zhang {
4708c3ff71bSJunchao Zhang   PetscErrorCode        ierr;
471076ba34aSJunchao Zhang   Mat_SeqAIJ            *bseq;
472076ba34aSJunchao Zhang   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
473076ba34aSJunchao Zhang   Mat                   mat;
4748c3ff71bSJunchao Zhang 
4758c3ff71bSJunchao Zhang   PetscFunctionBegin;
476076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
477076ba34aSJunchao Zhang   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
478076ba34aSJunchao Zhang   mat  = *B;
479076ba34aSJunchao Zhang   if (A->assembled) {
480076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
481076ba34aSJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
482076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
483076ba34aSJunchao Zhang     /* Now copy values to B if needed */
484076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
485076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
486076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
487076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
488076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
489076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
490076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
491076ba34aSJunchao Zhang       }
492076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
493076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
494076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
495076ba34aSJunchao Zhang     }
496076ba34aSJunchao Zhang     mat->spptr = bkok;
497076ba34aSJunchao Zhang   }
498076ba34aSJunchao Zhang 
499076ba34aSJunchao Zhang   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
500076ba34aSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
501076ba34aSJunchao Zhang   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
502076ba34aSJunchao Zhang   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
5038c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5048c3ff71bSJunchao Zhang }
5058c3ff71bSJunchao Zhang 
5060ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
5070ecb592aSJunchao Zhang {
5080ecb592aSJunchao Zhang   PetscErrorCode    ierr;
5090ecb592aSJunchao Zhang   Mat               At;
5100ecb592aSJunchao Zhang   KokkosCsrMatrix   *internT,*csrmatT;
5110ecb592aSJunchao Zhang   Mat_SeqAIJKokkos  *atkok,*bkok;
5120ecb592aSJunchao Zhang 
5130ecb592aSJunchao Zhang   PetscFunctionBegin;
5140ecb592aSJunchao Zhang   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
5150ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
5160ecb592aSJunchao Zhang     CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */
5170ecb592aSJunchao Zhang     CHKERRCXX(atkok   = new Mat_SeqAIJKokkos(*csrmatT));
5180ecb592aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
5190ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
5207a3b1c56SStefano Zampini     else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
5210ecb592aSJunchao Zhang   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
5220ecb592aSJunchao Zhang     if ((*B)->assembled) {
5230ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
5240ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
5250ecb592aSJunchao Zhang       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
5260ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
5270ecb592aSJunchao Zhang       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
5280ecb592aSJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
5290ecb592aSJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
5300ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
5310ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
5320ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
5330ecb592aSJunchao Zhang   }
5340ecb592aSJunchao Zhang   PetscFunctionReturn(0);
5350ecb592aSJunchao Zhang }
5360ecb592aSJunchao Zhang 
5378c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
5388c3ff71bSJunchao Zhang {
5398c3ff71bSJunchao Zhang   PetscErrorCode             ierr;
54086a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
5418c3ff71bSJunchao Zhang 
5428c3ff71bSJunchao Zhang   PetscFunctionBegin;
54386a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
54486a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
5458f7e8f9dSMark Adams     if (aijkok) {
5468f7e8f9dSMark Adams       if (aijkok->device_mat_d.data()) {
547a587d139SMark         delete aijkok->colmap_d;
548a587d139SMark         delete aijkok->i_uncompressed_d;
549a587d139SMark       }
5508f7e8f9dSMark Adams       if (aijkok->diag_d) delete aijkok->diag_d;
5518f7e8f9dSMark Adams     }
5528c3ff71bSJunchao Zhang     delete aijkok;
55386a27549SJunchao Zhang   } else {
55486a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
55586a27549SJunchao Zhang   }
556152b3e56SJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
557152b3e56SJunchao Zhang   A->spptr = NULL;
5588c3ff71bSJunchao Zhang   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
5598c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5608c3ff71bSJunchao Zhang }
5618c3ff71bSJunchao Zhang 
56286a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
56386a27549SJunchao Zhang {
56486a27549SJunchao Zhang   PetscErrorCode ierr;
56586a27549SJunchao Zhang 
56686a27549SJunchao Zhang   PetscFunctionBegin;
56786a27549SJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
56886a27549SJunchao Zhang   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
56986a27549SJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
57086a27549SJunchao Zhang   PetscFunctionReturn(0);
57186a27549SJunchao Zhang }
57286a27549SJunchao Zhang 
573076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
574076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
575a3f881fbSStefano Zampini {
576076ba34aSJunchao Zhang   PetscErrorCode               ierr;
577076ba34aSJunchao Zhang   Mat_SeqAIJ                   *a,*b;
578076ba34aSJunchao Zhang   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
579076ba34aSJunchao Zhang   MatScalarKokkosView          aa,ba,ca;
580076ba34aSJunchao Zhang   MatRowMapKokkosView          ai,bi,ci;
581076ba34aSJunchao Zhang   MatColIdxKokkosView          aj,bj,cj;
582076ba34aSJunchao Zhang   PetscInt                     m,n,nnz,aN;
583a3f881fbSStefano Zampini 
584a3f881fbSStefano Zampini   PetscFunctionBegin;
585076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
586076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
587076ba34aSJunchao Zhang   PetscValidPointer(C,4);
588076ba34aSJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
589076ba34aSJunchao Zhang   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
590c0aa6a63SJacob Faibussowitsch   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
591076ba34aSJunchao Zhang   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
592076ba34aSJunchao Zhang 
593076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
594076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
595076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ*>(A->data);
596076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ*>(B->data);
597076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
598076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
599076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
600076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
601076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
602076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
603076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
604076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
605076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
606076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
607076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
608076ba34aSJunchao Zhang     aj = akok->j_dual.view_device();
609076ba34aSJunchao Zhang     bj = bkok->j_dual.view_device();
610076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
611076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
612076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
613076ba34aSJunchao Zhang     ca = ca_dual.view_device();
614076ba34aSJunchao Zhang     ci = ci_dual.view_device();
615076ba34aSJunchao Zhang     cj = cj_dual.view_device();
616076ba34aSJunchao Zhang 
617076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
618076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
619076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
620076ba34aSJunchao Zhang       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
621076ba34aSJunchao Zhang 
622076ba34aSJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
623076ba34aSJunchao Zhang         ci(i) = coffset;
624076ba34aSJunchao Zhang         if (i == m-1) ci(m) = ai(m) + bi(m);
625076ba34aSJunchao Zhang       });
626076ba34aSJunchao Zhang 
627076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
628076ba34aSJunchao Zhang         if (k < alen) {
629076ba34aSJunchao Zhang           ca(coffset+k) = aa(ai(i)+k);
630076ba34aSJunchao Zhang           cj(coffset+k) = aj(ai(i)+k);
631076ba34aSJunchao Zhang         } else {
632076ba34aSJunchao Zhang           ca(coffset+k) = ba(bi(i)+k-alen);
633076ba34aSJunchao Zhang           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
634076ba34aSJunchao Zhang         }
635076ba34aSJunchao Zhang       });
636076ba34aSJunchao Zhang     });
637076ba34aSJunchao Zhang     ca_dual.modify_device();
638076ba34aSJunchao Zhang     ci_dual.modify_device();
639076ba34aSJunchao Zhang     cj_dual.modify_device();
640076ba34aSJunchao Zhang     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
641076ba34aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
642076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
643076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
644076ba34aSJunchao Zhang     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
645076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
646076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
647076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
648076ba34aSJunchao Zhang 
649076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
650076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
651076ba34aSJunchao Zhang       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
652076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
653076ba34aSJunchao Zhang         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
654076ba34aSJunchao Zhang         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
655076ba34aSJunchao Zhang       });
656076ba34aSJunchao Zhang     });
657076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
658076ba34aSJunchao Zhang   }
659076ba34aSJunchao Zhang   PetscFunctionReturn(0);
660076ba34aSJunchao Zhang }
661076ba34aSJunchao Zhang 
662076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
663076ba34aSJunchao Zhang {
664076ba34aSJunchao Zhang   PetscFunctionBegin;
665076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
666a3f881fbSStefano Zampini   PetscFunctionReturn(0);
667a3f881fbSStefano Zampini }
668a3f881fbSStefano Zampini 
669a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
670a3f881fbSStefano Zampini {
671076ba34aSJunchao Zhang   PetscErrorCode                 ierr;
672a3f881fbSStefano Zampini   Mat_Product                    *product = C->product;
673a3f881fbSStefano Zampini   Mat                            A,B;
674076ba34aSJunchao Zhang   bool                           transA,transB; /* use bool, since KK needs this type */
675a3f881fbSStefano Zampini   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
676a3f881fbSStefano Zampini   Mat_SeqAIJ                     *c;
677076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
678076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB;
679a3f881fbSStefano Zampini 
680a3f881fbSStefano Zampini   PetscFunctionBegin;
681a3f881fbSStefano Zampini   MatCheckProduct(C,1);
682a3f881fbSStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
683076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
684076ba34aSJunchao Zhang 
685076ba34aSJunchao Zhang   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
686076ba34aSJunchao Zhang     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
687076ba34aSJunchao Zhang     PetscFunctionReturn(0);
688076ba34aSJunchao Zhang   }
689076ba34aSJunchao Zhang 
690076ba34aSJunchao Zhang   switch (product->type) {
691076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
692076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
693076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
694076ba34aSJunchao Zhang     default:
695076ba34aSJunchao Zhang       SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
696076ba34aSJunchao Zhang   }
697076ba34aSJunchao Zhang 
698a3f881fbSStefano Zampini   A     = product->A;
699a3f881fbSStefano Zampini   B     = product->B;
700a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
701a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
702a3f881fbSStefano Zampini   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
703a3f881fbSStefano Zampini   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
704a3f881fbSStefano Zampini   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
705076ba34aSJunchao Zhang 
706076ba34aSJunchao Zhang   if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
707076ba34aSJunchao Zhang 
708076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
709076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
710076ba34aSJunchao Zhang 
711076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
712076ba34aSJunchao Zhang   if (transA) {
713076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
714076ba34aSJunchao Zhang     transA = false;
715a3f881fbSStefano Zampini   }
716a3f881fbSStefano Zampini 
717076ba34aSJunchao Zhang   if (transB) {
718076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
719076ba34aSJunchao Zhang     transB = false;
720076ba34aSJunchao Zhang   }
721*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
722076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
723076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
724*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
725076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
726a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
727a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
728c0aa6a63SJacob Faibussowitsch   ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
729a3f881fbSStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
730c0aa6a63SJacob Faibussowitsch   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
731a3f881fbSStefano Zampini   c->reallocs         = 0;
732076ba34aSJunchao Zhang   C->info.mallocs     = 0;
733a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
734a3f881fbSStefano Zampini   C->assembled        = C->was_assembled = PETSC_TRUE;
735a3f881fbSStefano Zampini   C->num_ass++;
736a3f881fbSStefano Zampini   PetscFunctionReturn(0);
737a3f881fbSStefano Zampini }
738a3f881fbSStefano Zampini 
739a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
740a3f881fbSStefano Zampini {
741a3f881fbSStefano Zampini   PetscErrorCode                 ierr;
742076ba34aSJunchao Zhang   Mat_Product                    *product = C->product;
743076ba34aSJunchao Zhang   MatProductType                 ptype;
744076ba34aSJunchao Zhang   Mat                            A,B;
745076ba34aSJunchao Zhang   bool                           transA,transB;
746076ba34aSJunchao Zhang   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
747076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
748076ba34aSJunchao Zhang   MPI_Comm                       comm;
749076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
750a3f881fbSStefano Zampini 
751a3f881fbSStefano Zampini   PetscFunctionBegin;
752a3f881fbSStefano Zampini   MatCheckProduct(C,1);
753076ba34aSJunchao Zhang   ierr = PetscObjectGetComm((PetscObject)C,&comm);
754076ba34aSJunchao Zhang   if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty");
755a3f881fbSStefano Zampini   A       = product->A;
756a3f881fbSStefano Zampini   B       = product->B;
757a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
758a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
759a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
760a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
761076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
762076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
763076ba34aSJunchao Zhang 
764a3f881fbSStefano Zampini   ptype   = product->type;
765a3f881fbSStefano Zampini   switch (ptype) {
766076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
767076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
768076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
769a3f881fbSStefano Zampini     default:
770076ba34aSJunchao Zhang       SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
771a3f881fbSStefano Zampini   }
772a3f881fbSStefano Zampini 
773076ba34aSJunchao Zhang   product->data = pdata = new MatProductData_SeqAIJKokkos();
774076ba34aSJunchao Zhang   pdata->kh.set_team_work_size(16);
775076ba34aSJunchao Zhang   pdata->kh.set_dynamic_scheduling(true);
776076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
777a3f881fbSStefano Zampini 
778076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
779076ba34aSJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
7806ffb9508SJunchao Zhang   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
7816ffb9508SJunchao Zhang      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
7826ffb9508SJunchao Zhang    */
783076ba34aSJunchao Zhang   pdata->kh.create_spgemm_handle(spgemm_alg);
784076ba34aSJunchao Zhang 
785*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
786076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
787076ba34aSJunchao Zhang   if (transA) {
788076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
789076ba34aSJunchao Zhang     transA = false;
790076ba34aSJunchao Zhang   }
791076ba34aSJunchao Zhang 
792076ba34aSJunchao Zhang   if (transB) {
793076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
794076ba34aSJunchao Zhang     transB = false;
795076ba34aSJunchao Zhang   }
796076ba34aSJunchao Zhang 
797076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
798076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
799076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
800076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
801076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
802076ba34aSJunchao Zhang   */
803076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
804076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
805*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
806076ba34aSJunchao Zhang 
807076ba34aSJunchao Zhang   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
808076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
809076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
810a3f881fbSStefano Zampini   PetscFunctionReturn(0);
811a3f881fbSStefano Zampini }
812a3f881fbSStefano Zampini 
813a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
814076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
815a3f881fbSStefano Zampini {
816a3f881fbSStefano Zampini   PetscErrorCode ierr;
817076ba34aSJunchao Zhang   Mat_Product    *product = mat->product;
818a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
819a3f881fbSStefano Zampini 
820a3f881fbSStefano Zampini   PetscFunctionBegin;
821a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
822a3f881fbSStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
823a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
824a3f881fbSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
825a3f881fbSStefano Zampini   }
826a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
827a3f881fbSStefano Zampini     switch (product->type) {
828a3f881fbSStefano Zampini     case MATPRODUCT_AB:
829a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
830a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
831a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
832a3f881fbSStefano Zampini       break;
833a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
834a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
835a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
836a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
837a3f881fbSStefano Zampini       break;
838a3f881fbSStefano Zampini     default:
839a3f881fbSStefano Zampini       break;
840a3f881fbSStefano Zampini     }
841a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
842a3f881fbSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
843a3f881fbSStefano Zampini   }
844a3f881fbSStefano Zampini   PetscFunctionReturn(0);
845a3f881fbSStefano Zampini }
846a587d139SMark 
847f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
848f0cf5187SStefano Zampini {
849f0cf5187SStefano Zampini   PetscErrorCode   ierr;
850f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
851f0cf5187SStefano Zampini 
852f0cf5187SStefano Zampini   PetscFunctionBegin;
853*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
854f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
855f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
856076ba34aSJunchao Zhang   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
857076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
858076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
859*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
860f0cf5187SStefano Zampini   PetscFunctionReturn(0);
861f0cf5187SStefano Zampini }
862f0cf5187SStefano Zampini 
863a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
864a587d139SMark {
865a587d139SMark   PetscErrorCode   ierr;
866076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
867a587d139SMark 
868a587d139SMark   PetscFunctionBegin;
869076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
8702328674fSJunchao Zhang   if (aijkok) { /* Only zero the device if data is already there */
871076ba34aSJunchao Zhang     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
872076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
8732328674fSJunchao Zhang   } else { /* Might be preallocated but not assembled */
8742328674fSJunchao Zhang     ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
8752328674fSJunchao Zhang   }
876a587d139SMark   PetscFunctionReturn(0);
877a587d139SMark }
878a587d139SMark 
879db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
880db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
881db78de30SJunchao Zhang {
882db78de30SJunchao Zhang   PetscErrorCode     ierr;
883db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
884db78de30SJunchao Zhang 
885db78de30SJunchao Zhang   PetscFunctionBegin;
886db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
887db78de30SJunchao Zhang   PetscValidPointer(kv,2);
888db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
889db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
890db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
891076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
892db78de30SJunchao Zhang   PetscFunctionReturn(0);
893db78de30SJunchao Zhang }
894db78de30SJunchao Zhang 
895db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
896db78de30SJunchao Zhang {
897db78de30SJunchao Zhang   PetscFunctionBegin;
898db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
899db78de30SJunchao Zhang   PetscValidPointer(kv,2);
900db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
901db78de30SJunchao Zhang   PetscFunctionReturn(0);
902db78de30SJunchao Zhang }
903db78de30SJunchao Zhang 
904db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv)
905db78de30SJunchao Zhang {
906db78de30SJunchao Zhang   PetscErrorCode     ierr;
907db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
908db78de30SJunchao Zhang 
909db78de30SJunchao Zhang   PetscFunctionBegin;
910db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
911db78de30SJunchao Zhang   PetscValidPointer(kv,2);
912db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
913db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
914db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
915076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
916db78de30SJunchao Zhang   PetscFunctionReturn(0);
917db78de30SJunchao Zhang }
918db78de30SJunchao Zhang 
919db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv)
920db78de30SJunchao Zhang {
921db78de30SJunchao Zhang   PetscErrorCode     ierr;
922db78de30SJunchao Zhang 
923db78de30SJunchao Zhang   PetscFunctionBegin;
924db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
925db78de30SJunchao Zhang   PetscValidPointer(kv,2);
926db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
927076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
928db78de30SJunchao Zhang   PetscFunctionReturn(0);
929db78de30SJunchao Zhang }
930db78de30SJunchao Zhang 
931db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
932db78de30SJunchao Zhang {
933db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
934db78de30SJunchao Zhang 
935db78de30SJunchao Zhang   PetscFunctionBegin;
936db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
937db78de30SJunchao Zhang   PetscValidPointer(kv,2);
938db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
939db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
940076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
941db78de30SJunchao Zhang   PetscFunctionReturn(0);
942db78de30SJunchao Zhang }
943db78de30SJunchao Zhang 
944db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
945db78de30SJunchao Zhang {
946db78de30SJunchao Zhang   PetscErrorCode     ierr;
947db78de30SJunchao Zhang 
948db78de30SJunchao Zhang   PetscFunctionBegin;
949db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
950db78de30SJunchao Zhang   PetscValidPointer(kv,2);
951db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
952076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
953db78de30SJunchao Zhang   PetscFunctionReturn(0);
954db78de30SJunchao Zhang }
955db78de30SJunchao Zhang 
956c17cf699SJunchao Zhang /* Computes Y += alpha X */
957c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
958a587d139SMark {
959a587d139SMark   PetscErrorCode             ierr;
960a587d139SMark   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
961c17cf699SJunchao Zhang   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
962c17cf699SJunchao Zhang   ConstMatScalarKokkosView   Xa;
963c17cf699SJunchao Zhang   MatScalarKokkosView        Ya;
964a587d139SMark 
965a587d139SMark   PetscFunctionBegin;
966c17cf699SJunchao Zhang   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
967c17cf699SJunchao Zhang   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
968c17cf699SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
969c17cf699SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
970*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
971db78de30SJunchao Zhang 
972c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
973c17cf699SJunchao Zhang     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
974a587d139SMark     PetscBool e;
975a587d139SMark     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
976a587d139SMark     if (e) {
977a587d139SMark       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
978c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
979a587d139SMark     }
980a587d139SMark   }
981db78de30SJunchao Zhang 
982c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
983c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
984c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
985c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
986c17cf699SJunchao Zhang   */
987c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
988c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
989c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
990c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
991c17cf699SJunchao Zhang 
992c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
993c17cf699SJunchao Zhang     KokkosBlas::axpy(alpha,Xa,Ya);
994c17cf699SJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(Y);
995c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
996c17cf699SJunchao Zhang     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
997c17cf699SJunchao Zhang     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
998c17cf699SJunchao Zhang 
999c17cf699SJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
1000c17cf699SJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
1001c17cf699SJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
1002c17cf699SJunchao Zhang         PetscInt p,q = Yi(i);
1003c17cf699SJunchao Zhang         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
1004c17cf699SJunchao Zhang           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
1005c17cf699SJunchao Zhang           if (Xj(p) == Yj(q)) { /* Found it */
1006c17cf699SJunchao Zhang             Ya(q) += alpha * Xa(p);
1007c17cf699SJunchao Zhang             q++;
1008a587d139SMark           } else {
1009c17cf699SJunchao Zhang             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1010c17cf699SJunchao Zhang                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1011c17cf699SJunchao Zhang             */
1012c17cf699SJunchao Zhang             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
1013a587d139SMark           }
1014c17cf699SJunchao Zhang         }
1015c17cf699SJunchao Zhang       });
1016c17cf699SJunchao Zhang     });
1017c17cf699SJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(Y);
1018c17cf699SJunchao Zhang   } else { /* different nonzero patterns */
1019c17cf699SJunchao Zhang     Mat             Z;
1020c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
1021c17cf699SJunchao Zhang     KernelHandle    kh;
1022c17cf699SJunchao Zhang     kh.create_spadd_handle(false);
1023c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1024c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1025c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
1026c17cf699SJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1027c17cf699SJunchao Zhang     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1028c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1029c17cf699SJunchao Zhang   }
1030*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1031*eeadb341SJunchao Zhang   ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */
1032a587d139SMark   PetscFunctionReturn(0);
1033a587d139SMark }
1034a587d139SMark 
103586a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
10368f7e8f9dSMark Adams {
10378f7e8f9dSMark Adams   PetscErrorCode ierr;
10388f7e8f9dSMark Adams 
10398f7e8f9dSMark Adams   PetscFunctionBegin;
10408f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
10418f7e8f9dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
10428f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
10438f7e8f9dSMark Adams   PetscFunctionReturn(0);
10448f7e8f9dSMark Adams }
10458f7e8f9dSMark Adams 
10468c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
10478c3ff71bSJunchao Zhang {
1048076ba34aSJunchao Zhang   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1049076ba34aSJunchao Zhang 
10508c3ff71bSJunchao Zhang   PetscFunctionBegin;
1051076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
10526f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
10536f3d89d0SStefano Zampini 
10548c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
10558c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
10568c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1057a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1058f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1059a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1060076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
10618c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
10628c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
10638c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
10648c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
10658c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
10668c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1067076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
10680ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1069152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1070076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1071076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1072076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1073076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1074076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1075076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1076076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1077076ba34aSJunchao Zhang }
1078076ba34aSJunchao Zhang 
1079076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1080076ba34aSJunchao Zhang {
1081076ba34aSJunchao Zhang   PetscErrorCode ierr;
1082076ba34aSJunchao Zhang   Mat_SeqAIJ     *aseq;
1083076ba34aSJunchao Zhang   PetscInt       i,m,n;
1084076ba34aSJunchao Zhang 
1085076ba34aSJunchao Zhang   PetscFunctionBegin;
1086076ba34aSJunchao Zhang   if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1087076ba34aSJunchao Zhang 
1088076ba34aSJunchao Zhang   m    = akok->nrows();
1089076ba34aSJunchao Zhang   n    = akok->ncols();
1090076ba34aSJunchao Zhang   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1091076ba34aSJunchao Zhang   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1092076ba34aSJunchao Zhang 
1093076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
1094076ba34aSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1095076ba34aSJunchao Zhang   aseq = (Mat_SeqAIJ*)(A)->data;
1096076ba34aSJunchao Zhang 
1097076ba34aSJunchao Zhang   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1098076ba34aSJunchao Zhang   akok->j_dual.sync_host();
1099076ba34aSJunchao Zhang 
1100076ba34aSJunchao Zhang   aseq->i            = akok->i_host_data();
1101076ba34aSJunchao Zhang   aseq->j            = akok->j_host_data();
1102076ba34aSJunchao Zhang   aseq->a            = akok->a_host_data();
1103076ba34aSJunchao Zhang   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1104076ba34aSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE;
1105076ba34aSJunchao Zhang   aseq->free_a       = PETSC_FALSE;
1106076ba34aSJunchao Zhang   aseq->free_ij      = PETSC_FALSE;
1107076ba34aSJunchao Zhang   aseq->nz           = akok->nnz();
1108076ba34aSJunchao Zhang   aseq->maxnz        = aseq->nz;
1109076ba34aSJunchao Zhang 
1110076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1111076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1112076ba34aSJunchao Zhang   for (i=0; i<m; i++) {
1113076ba34aSJunchao Zhang     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1114076ba34aSJunchao Zhang   }
1115076ba34aSJunchao Zhang 
1116076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1117076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1118076ba34aSJunchao Zhang   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1119076ba34aSJunchao Zhang   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1120076ba34aSJunchao Zhang   A->spptr = akok;
1121076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1122076ba34aSJunchao Zhang }
1123076ba34aSJunchao Zhang 
1124076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1125076ba34aSJunchao Zhang 
1126076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1127076ba34aSJunchao Zhang  */
1128076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1129076ba34aSJunchao Zhang {
1130076ba34aSJunchao Zhang   PetscErrorCode ierr;
1131076ba34aSJunchao Zhang 
1132076ba34aSJunchao Zhang   PetscFunctionBegin;
1133076ba34aSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1134076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
11358c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
11368c3ff71bSJunchao Zhang }
11378c3ff71bSJunchao Zhang 
11388c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
1139152b3e56SJunchao Zhang /*@C
11408c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
11418c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
11428c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
11438c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
11448c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
11458c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
11468c3ff71bSJunchao Zhang 
11478c3ff71bSJunchao Zhang    Collective
11488c3ff71bSJunchao Zhang 
11498c3ff71bSJunchao Zhang    Input Parameters:
11508c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
11518c3ff71bSJunchao Zhang .  m - number of rows
11528c3ff71bSJunchao Zhang .  n - number of columns
11538c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
11548c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
11558c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
11568c3ff71bSJunchao Zhang 
11578c3ff71bSJunchao Zhang    Output Parameter:
11588c3ff71bSJunchao Zhang .  A - the matrix
11598c3ff71bSJunchao Zhang 
11608c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
11618c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
11628c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
11638c3ff71bSJunchao Zhang 
11648c3ff71bSJunchao Zhang    Notes:
11658c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
11668c3ff71bSJunchao Zhang 
11678c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
11688c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
11698c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
11708c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
11718c3ff71bSJunchao Zhang 
11728c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
11738c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
11748c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
11758c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
11768c3ff71bSJunchao Zhang 
11778c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
11788c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
11798c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
11808c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
11818c3ff71bSJunchao Zhang 
11828c3ff71bSJunchao Zhang    Level: intermediate
11838c3ff71bSJunchao Zhang 
1184076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
11858c3ff71bSJunchao Zhang @*/
11868c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
11878c3ff71bSJunchao Zhang {
11888c3ff71bSJunchao Zhang   PetscErrorCode ierr;
11898c3ff71bSJunchao Zhang 
11908c3ff71bSJunchao Zhang   PetscFunctionBegin;
11918c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
11928c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
11938c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
11948c3ff71bSJunchao Zhang   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
11958c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
11968c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
11978c3ff71bSJunchao Zhang }
1198930e68a5SMark Adams 
11998f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
12008f7e8f9dSMark Adams //
12018f7e8f9dSMark Adams // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
12028f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
12038f7e8f9dSMark Adams //
12048f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1205930e68a5SMark Adams {
12068f7e8f9dSMark Adams   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
12078f7e8f9dSMark Adams   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
12088f7e8f9dSMark Adams   IS                 isrow = b->row,isicol = b->icol;
1209930e68a5SMark Adams   PetscErrorCode     ierr;
12108f7e8f9dSMark Adams   const PetscInt     *r_h,*ic_h;
1211076ba34aSJunchao Zhang   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data();
1212076ba34aSJunchao Zhang   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1213076ba34aSJunchao Zhang   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
12148f7e8f9dSMark Adams   PetscBool          row_identity,col_identity;
12158f7e8f9dSMark Adams   PetscInt           nc, Nf, nVec=32; // should be a parameter
12168f7e8f9dSMark Adams   PetscContainer     container;
1217930e68a5SMark Adams 
1218930e68a5SMark Adams   PetscFunctionBegin;
1219c0aa6a63SJacob Faibussowitsch   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
12208f7e8f9dSMark Adams   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
12218f7e8f9dSMark Adams   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
12228f7e8f9dSMark Adams   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
12238f7e8f9dSMark Adams   if (container) {
12248f7e8f9dSMark Adams     PetscInt *pNf=NULL, nv;
12258f7e8f9dSMark Adams     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
12268f7e8f9dSMark Adams     Nf = (*pNf)%1000;
12278f7e8f9dSMark Adams     nv = (*pNf)/1000;
12288f7e8f9dSMark Adams     if (nv>0) nVec = nv;
12298f7e8f9dSMark Adams   } else Nf = 1;
1230c0aa6a63SJacob Faibussowitsch   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %" PetscInt_FMT " %" PetscInt_FMT,n,Nf);
12318f7e8f9dSMark Adams   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
12328f7e8f9dSMark Adams   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
12338f7e8f9dSMark Adams   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
12348f7e8f9dSMark Adams   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
12358f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
12368f7e8f9dSMark Adams   {
12378f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
12388f7e8f9dSMark Adams     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
12398f7e8f9dSMark Adams     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
12408f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
12418f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
12428f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
12438f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
12448f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
12458f7e8f9dSMark Adams     size_t flops_h = 0.0;
12468f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
12478f7e8f9dSMark Adams     Kokkos::View<size_t> d_flops_k ("flops");
12488f7e8f9dSMark Adams     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
12498f7e8f9dSMark Adams     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
12508f7e8f9dSMark Adams     Kokkos::deep_copy (d_flops_k, h_flops_k);
12518f7e8f9dSMark Adams     Kokkos::deep_copy (d_r_k, h_r_k);
12528f7e8f9dSMark Adams     Kokkos::deep_copy (d_ic_k, h_ic_k);
12538f7e8f9dSMark Adams     // Fill A --> fact
12548f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1255042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
12568f7e8f9dSMark Adams         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
12578f7e8f9dSMark Adams         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
12588f7e8f9dSMark Adams         // zero rows of B
12598f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
12608f7e8f9dSMark Adams             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
12618f7e8f9dSMark Adams             PetscScalar *baL = ba_d + bi_d[rowb];
12628f7e8f9dSMark Adams             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
12638f7e8f9dSMark Adams             /* zero (unfactored row) */
12648f7e8f9dSMark Adams             for (int j=0;j<nzbL;j++) baL[j] = 0;
12658f7e8f9dSMark Adams             for (int j=0;j<nzbU;j++) baU[j] = 0;
12668f7e8f9dSMark Adams           });
12678f7e8f9dSMark Adams         // copy A into B
12688f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
12698f7e8f9dSMark Adams             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
12708f7e8f9dSMark Adams             const PetscScalar *av    = aa_d + ai_d[rowa];
12718f7e8f9dSMark Adams             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
12728f7e8f9dSMark Adams             /* load in initial (unfactored row) */
12738f7e8f9dSMark Adams             for (int j=0;j<nza;j++) {
12748f7e8f9dSMark Adams               PetscInt    colb = ic[ajtmp[j]];
12758f7e8f9dSMark Adams               PetscScalar vala = av[j];
12768f7e8f9dSMark Adams               if (colb == rowb) {
12778f7e8f9dSMark Adams                 *(ba_d + bdiag_d[rowb]) = vala;
12788f7e8f9dSMark Adams               } else {
12798f7e8f9dSMark Adams                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
12808f7e8f9dSMark Adams                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
12818f7e8f9dSMark Adams                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
12828f7e8f9dSMark Adams                 for (int j=0; j<nz ; j++) {
12838f7e8f9dSMark Adams                   if (pbj[j] == colb) {
12848f7e8f9dSMark Adams                     pba[j] = vala;
12858f7e8f9dSMark Adams                     set++;
12868f7e8f9dSMark Adams                     break;
12878f7e8f9dSMark Adams                   }
12888f7e8f9dSMark Adams                 }
12898f7e8f9dSMark Adams                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
12908f7e8f9dSMark Adams               }
12918f7e8f9dSMark Adams             }
12928f7e8f9dSMark Adams           });
12938f7e8f9dSMark Adams       });
12948f7e8f9dSMark Adams     Kokkos::fence();
1295930e68a5SMark Adams 
12968f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
12978f7e8f9dSMark Adams         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
12988f7e8f9dSMark Adams         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
12998f7e8f9dSMark Adams         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1300042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
13018f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc;
13028f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
13038f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
13048f7e8f9dSMark Adams         for (int ii=start; ii<end-1; ii++) {
13058f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
13068f7e8f9dSMark Adams           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
13078f7e8f9dSMark Adams           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
13088f7e8f9dSMark Adams           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
13098f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
13108f7e8f9dSMark Adams               PetscInt kIdx = j*Ni + field_block_idx;
13118f7e8f9dSMark Adams               if (kIdx >= nzUi) /* void */ ;
13128f7e8f9dSMark Adams               else {
13138f7e8f9dSMark Adams                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
13148f7e8f9dSMark Adams                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
13158f7e8f9dSMark Adams                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
13168f7e8f9dSMark Adams                 size_t         st_idx;
13178f7e8f9dSMark Adams                 // find and do L(k,i) = A(:k,i) / A(i,i)
13188f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
13198f7e8f9dSMark Adams                 // get column, there has got to be a better way
13208f7e8f9dSMark Adams                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
13218f7e8f9dSMark Adams                     if (pjL[j] == ii) {
13228f7e8f9dSMark Adams                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
13238f7e8f9dSMark Adams                       idx = j; // output
13248f7e8f9dSMark Adams                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
13258f7e8f9dSMark Adams                     }
13268f7e8f9dSMark Adams                 }, st_idx);
13278f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
132899551766SMark Adams #if defined(PETSC_USE_DEBUG)
132999551766SMark Adams                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
133099551766SMark Adams #endif
133199551766SMark Adams                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
13328f7e8f9dSMark Adams                 // U(i+1,:end)
13338f7e8f9dSMark Adams                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
13348f7e8f9dSMark Adams                       PetscScalar Uij = baUi[uiIdx];
13358f7e8f9dSMark Adams                       PetscInt    col = bjUi[uiIdx];
13368f7e8f9dSMark Adams                       if (col==myk) {
13378f7e8f9dSMark Adams                         // A_kk = A_kk - L_ki * U_ij(k)
13388f7e8f9dSMark Adams                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
13398f7e8f9dSMark Adams                         *Akkv = *Akkv - L_ki() * Uij; // UiK
13408f7e8f9dSMark Adams                       } else {
13418f7e8f9dSMark Adams                         PetscScalar    *start, *end, *pAkjv=NULL;
13428f7e8f9dSMark Adams                         PetscInt       high, low;
13438f7e8f9dSMark Adams                         const PetscInt *startj;
13448f7e8f9dSMark Adams                         if (col<myk) { // L
13458f7e8f9dSMark Adams                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
13468f7e8f9dSMark Adams                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
13478f7e8f9dSMark Adams                           start = pLki+1; // start at pLki+1, A22(myk,1)
13488f7e8f9dSMark Adams                           startj= bj_d + bi_d[myk] + idx;
13498f7e8f9dSMark Adams                           end   = ba_d + bi_d[myk+1];
13508f7e8f9dSMark Adams                         } else {
13518f7e8f9dSMark Adams                           PetscInt idx = bdiag_d[myk+1]+1;
13528f7e8f9dSMark Adams                           start = ba_d + idx;
13538f7e8f9dSMark Adams                           startj= bj_d + idx;
13548f7e8f9dSMark Adams                           end   = ba_d + bdiag_d[myk];
13558f7e8f9dSMark Adams                         }
13568f7e8f9dSMark Adams                         // search for 'col', use bisection search - TODO
13578f7e8f9dSMark Adams                         low  = 0;
13588f7e8f9dSMark Adams                         high = (PetscInt)(end-start);
13598f7e8f9dSMark Adams                         while (high-low > 5) {
13608f7e8f9dSMark Adams                           int t = (low+high)/2;
13618f7e8f9dSMark Adams                           if (startj[t] > col) high = t;
13628f7e8f9dSMark Adams                           else                 low  = t;
13638f7e8f9dSMark Adams                         }
13648f7e8f9dSMark Adams                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
13658f7e8f9dSMark Adams                           if (startj[pAkjv-start] == col) break;
13668f7e8f9dSMark Adams                         }
136799551766SMark Adams #if defined(PETSC_USE_DEBUG)
136899551766SMark Adams                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
136999551766SMark Adams #endif
13708f7e8f9dSMark Adams                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
13718f7e8f9dSMark Adams                       }
13728f7e8f9dSMark Adams                     });
13738f7e8f9dSMark Adams               }
13748f7e8f9dSMark Adams             });
13758f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
13768f7e8f9dSMark Adams           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
13778f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
13788f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
13798f7e8f9dSMark Adams       });
13808f7e8f9dSMark Adams     Kokkos::fence();
13818f7e8f9dSMark Adams     Kokkos::deep_copy (h_flops_k, d_flops_k);
13828f7e8f9dSMark Adams     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
13838f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
13848f7e8f9dSMark Adams         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
13858f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
13868f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
13878f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
13888f7e8f9dSMark Adams             int i = start + outer_index*Ni + lg_rank%Ni;
13898f7e8f9dSMark Adams             if (i < end) {
13908f7e8f9dSMark Adams               PetscScalar *pv = ba_d + bdiag_d[i];
13918f7e8f9dSMark Adams               *pv = 1.0/(*pv);
13928f7e8f9dSMark Adams             }
13938f7e8f9dSMark Adams           });
13948f7e8f9dSMark Adams       });
13958f7e8f9dSMark Adams   }
13968f7e8f9dSMark Adams   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
13978f7e8f9dSMark Adams   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
13988f7e8f9dSMark Adams   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
13998f7e8f9dSMark Adams 
14008f7e8f9dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
14018f7e8f9dSMark Adams   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
14028f7e8f9dSMark Adams   if (b->inode.size) {
14038f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
14048f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
14058f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
14068f7e8f9dSMark Adams   } else {
14078f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
14088f7e8f9dSMark Adams   }
14098f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
14108f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
14118f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
14128f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
14138f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
14148f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
14158f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
14168f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
14178f7e8f9dSMark Adams 
1418930e68a5SMark Adams   PetscFunctionReturn(0);
1419930e68a5SMark Adams }
1420930e68a5SMark Adams 
142186a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1422930e68a5SMark Adams {
1423930e68a5SMark Adams   PetscErrorCode   ierr;
1424930e68a5SMark Adams 
1425930e68a5SMark Adams   PetscFunctionBegin;
1426930e68a5SMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
142786a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
142886a27549SJunchao Zhang   PetscFunctionReturn(0);
142986a27549SJunchao Zhang }
143086a27549SJunchao Zhang 
143186a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
143286a27549SJunchao Zhang {
143386a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
143486a27549SJunchao Zhang 
143586a27549SJunchao Zhang   PetscFunctionBegin;
143686a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
143786a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
143886a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
143986a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
144086a27549SJunchao Zhang   }
144186a27549SJunchao Zhang   PetscFunctionReturn(0);
144286a27549SJunchao Zhang }
144386a27549SJunchao Zhang 
144486a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
144586a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
144686a27549SJunchao Zhang {
144786a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1448076ba34aSJunchao Zhang   MatColIdxType             n = A->rmap->n;
144986a27549SJunchao Zhang 
145086a27549SJunchao Zhang   PetscFunctionBegin;
145186a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
145286a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
1453076ba34aSJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
145486a27549SJunchao Zhang     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1455076ba34aSJunchao Zhang     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1456076ba34aSJunchao Zhang     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
145786a27549SJunchao Zhang 
145886a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1459076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1460076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1461076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
146286a27549SJunchao Zhang         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
146386a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
146486a27549SJunchao Zhang 
146586a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
146686a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
146786a27549SJunchao Zhang     */
1468076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1469076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
147086a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
147186a27549SJunchao Zhang 
147286a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
147386a27549SJunchao Zhang 
147486a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
1475076ba34aSJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
147686a27549SJunchao Zhang     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1477076ba34aSJunchao Zhang     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1478076ba34aSJunchao Zhang     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
147986a27549SJunchao Zhang 
148086a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1481076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1482076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1483076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
148486a27549SJunchao Zhang         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
148586a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
148686a27549SJunchao Zhang 
148786a27549SJunchao Zhang     /* Sort indices. See comments above */
1488076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1489076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
149086a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
149186a27549SJunchao Zhang 
149286a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
149386a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
149486a27549SJunchao Zhang   }
149586a27549SJunchao Zhang   PetscFunctionReturn(0);
149686a27549SJunchao Zhang }
149786a27549SJunchao Zhang 
149886a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
149986a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
150086a27549SJunchao Zhang {
150186a27549SJunchao Zhang   PetscErrorCode                 ierr;
150286a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
150386a27549SJunchao Zhang   PetscScalarKokkosView          xv;
150486a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
150586a27549SJunchao Zhang 
150686a27549SJunchao Zhang   PetscFunctionBegin;
1507*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
150886a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
150986a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
151086a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
151186a27549SJunchao Zhang   /* Solve L tmpv = b */
15123f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
151386a27549SJunchao Zhang   /* Solve Ux = tmpv */
15143f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
151586a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
151686a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1517*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
151886a27549SJunchao Zhang   PetscFunctionReturn(0);
151986a27549SJunchao Zhang }
152086a27549SJunchao Zhang 
1521076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
152286a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
152386a27549SJunchao Zhang {
152486a27549SJunchao Zhang   PetscErrorCode                 ierr;
152586a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
152686a27549SJunchao Zhang   PetscScalarKokkosView          xv;
152786a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
152886a27549SJunchao Zhang 
152986a27549SJunchao Zhang   PetscFunctionBegin;
1530*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
153186a27549SJunchao Zhang   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
153286a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
153386a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
153486a27549SJunchao Zhang   /* Solve U^T tmpv = b */
153586a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
153686a27549SJunchao Zhang 
153786a27549SJunchao Zhang   /* Solve L^T x = tmpv */
153886a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
153986a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
154086a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1541*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
154286a27549SJunchao Zhang   PetscFunctionReturn(0);
154386a27549SJunchao Zhang }
154486a27549SJunchao Zhang 
154586a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
154686a27549SJunchao Zhang {
154786a27549SJunchao Zhang   PetscErrorCode                 ierr;
154886a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
154986a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
155086a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
155186a27549SJunchao Zhang 
155286a27549SJunchao Zhang   PetscFunctionBegin;
1553*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
155486a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1555076ba34aSJunchao Zhang 
1556076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1557076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1558076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1559076ba34aSJunchao Zhang 
1560076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
156186a27549SJunchao Zhang 
156286a27549SJunchao Zhang   B->assembled                       = PETSC_TRUE;
156386a27549SJunchao Zhang   B->preallocated                    = PETSC_TRUE;
156486a27549SJunchao Zhang   B->ops->solve                      = MatSolve_SeqAIJKokkos;
156586a27549SJunchao Zhang   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
156686a27549SJunchao Zhang   B->ops->matsolve                   = NULL;
156786a27549SJunchao Zhang   B->ops->matsolvetranspose          = NULL;
156886a27549SJunchao Zhang   B->offloadmask                     = PETSC_OFFLOAD_GPU;
156986a27549SJunchao Zhang 
157086a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
157186a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
157286a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1573*eeadb341SJunchao Zhang   /* TODO: log flops, but how to know that? */
1574*eeadb341SJunchao Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
157586a27549SJunchao Zhang   PetscFunctionReturn(0);
157686a27549SJunchao Zhang }
157786a27549SJunchao Zhang 
157886a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
157986a27549SJunchao Zhang {
158086a27549SJunchao Zhang   PetscErrorCode                 ierr;
158186a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok;
158286a27549SJunchao Zhang   Mat_SeqAIJ                     *b;
158386a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
158486a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
158586a27549SJunchao Zhang   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
158686a27549SJunchao Zhang   PetscInt                       n = A->rmap->n;
158786a27549SJunchao Zhang 
158886a27549SJunchao Zhang   PetscFunctionBegin;
158986a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
159086a27549SJunchao Zhang   /* Rebuild factors */
159186a27549SJunchao Zhang   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
159286a27549SJunchao Zhang   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
159386a27549SJunchao Zhang 
159486a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
159586a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
159686a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
159786a27549SJunchao Zhang 
159886a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
159986a27549SJunchao Zhang 
160086a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
160186a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
160286a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d,n+1);
160386a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
160486a27549SJunchao Zhang 
160586a27549SJunchao Zhang   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1606076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1607076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1608076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
160986a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
161086a27549SJunchao Zhang 
161186a27549SJunchao Zhang   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
161286a27549SJunchao Zhang   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
161386a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
161486a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
161586a27549SJunchao Zhang 
161686a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
161786a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
161886a27549SJunchao Zhang  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
161986a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
162086a27549SJunchao Zhang  #else
162186a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
162286a27549SJunchao Zhang  #endif
162386a27549SJunchao Zhang 
162486a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
162586a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
162686a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
162786a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
162886a27549SJunchao Zhang 
162986a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
163086a27549SJunchao Zhang   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
163186a27549SJunchao Zhang   b     = (Mat_SeqAIJ*)B->data;
163286a27549SJunchao Zhang   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
163386a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
163486a27549SJunchao Zhang   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
163586a27549SJunchao Zhang 
163686a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
163786a27549SJunchao Zhang   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1638930e68a5SMark Adams   PetscFunctionReturn(0);
1639930e68a5SMark Adams }
1640930e68a5SMark Adams 
16418f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
16428f7e8f9dSMark Adams {
16438f7e8f9dSMark Adams   PetscErrorCode   ierr;
16448f7e8f9dSMark Adams   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
16458f7e8f9dSMark Adams   const PetscInt   nrows   = A->rmap->n;
1646930e68a5SMark Adams 
16478f7e8f9dSMark Adams   PetscFunctionBegin;
16488f7e8f9dSMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
16498f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
16508f7e8f9dSMark Adams   // move B data into Kokkos
16518f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
16528f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
16538f7e8f9dSMark Adams   {
16548f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
16558f7e8f9dSMark Adams     if (!baijkok->diag_d) {
16568f7e8f9dSMark Adams       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1657152b3e56SJunchao Zhang       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
16588f7e8f9dSMark Adams       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
16598f7e8f9dSMark Adams     }
16608f7e8f9dSMark Adams   }
16618f7e8f9dSMark Adams   PetscFunctionReturn(0);
16628f7e8f9dSMark Adams }
16638f7e8f9dSMark Adams 
166486a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1665930e68a5SMark Adams {
1666930e68a5SMark Adams   PetscFunctionBegin;
1667930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
1668930e68a5SMark Adams   PetscFunctionReturn(0);
1669930e68a5SMark Adams }
1670930e68a5SMark Adams 
16718f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
16728f7e8f9dSMark Adams {
16738f7e8f9dSMark Adams   PetscFunctionBegin;
16748f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
16758f7e8f9dSMark Adams   PetscFunctionReturn(0);
16768f7e8f9dSMark Adams }
16778f7e8f9dSMark Adams 
1678930e68a5SMark Adams /*MC
167986a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
168086a27549SJunchao Zhang   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1681930e68a5SMark Adams 
1682930e68a5SMark Adams   Level: beginner
1683930e68a5SMark Adams 
168486a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1685930e68a5SMark Adams M*/
168686a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1687930e68a5SMark Adams {
1688930e68a5SMark Adams   PetscErrorCode ierr;
1689930e68a5SMark Adams   PetscInt       n = A->rmap->n;
1690930e68a5SMark Adams 
1691930e68a5SMark Adams   PetscFunctionBegin;
1692930e68a5SMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1693930e68a5SMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1694930e68a5SMark Adams   (*B)->factortype = ftype;
16954ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1696930e68a5SMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1697930e68a5SMark Adams 
16988f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
1699930e68a5SMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
170086a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_TRUE;
170186a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
170286a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
170386a27549SJunchao Zhang     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
170486a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
170586a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
170686a27549SJunchao Zhang   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1707930e68a5SMark Adams 
1708930e68a5SMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
170986a27549SJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1710930e68a5SMark Adams   PetscFunctionReturn(0);
1711930e68a5SMark Adams }
17128f7e8f9dSMark Adams 
17138f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
17148f7e8f9dSMark Adams {
17158f7e8f9dSMark Adams   PetscErrorCode ierr;
17168f7e8f9dSMark Adams   PetscInt       n = A->rmap->n;
17178f7e8f9dSMark Adams 
17188f7e8f9dSMark Adams   PetscFunctionBegin;
17198f7e8f9dSMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
17208f7e8f9dSMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
17218f7e8f9dSMark Adams   (*B)->factortype = ftype;
1722f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
17234ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
17248f7e8f9dSMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
17258f7e8f9dSMark Adams 
17268f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
17278f7e8f9dSMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
17288f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
17298f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
17308f7e8f9dSMark Adams 
17318f7e8f9dSMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
17328f7e8f9dSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
17338f7e8f9dSMark Adams   PetscFunctionReturn(0);
17348f7e8f9dSMark Adams }
173586a27549SJunchao Zhang 
173686a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
173786a27549SJunchao Zhang {
173886a27549SJunchao Zhang   PetscErrorCode ierr;
173986a27549SJunchao Zhang 
174086a27549SJunchao Zhang   PetscFunctionBegin;
174186a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
174286a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
174386a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
174486a27549SJunchao Zhang   PetscFunctionReturn(0);
174586a27549SJunchao Zhang }
174686a27549SJunchao Zhang 
1747076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
1748076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1749076ba34aSJunchao Zhang {
1750076ba34aSJunchao Zhang   PetscErrorCode    ierr;
1751076ba34aSJunchao Zhang   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1752076ba34aSJunchao Zhang   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1753076ba34aSJunchao Zhang   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1754076ba34aSJunchao Zhang   const PetscInt    *i = iv.data();
1755076ba34aSJunchao Zhang   const PetscInt    *j = jv.data();
1756076ba34aSJunchao Zhang   const PetscScalar *a = av.data();
1757076ba34aSJunchao Zhang   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1758076ba34aSJunchao Zhang 
1759076ba34aSJunchao Zhang   PetscFunctionBegin;
1760c0aa6a63SJacob Faibussowitsch   ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1761076ba34aSJunchao Zhang   for (PetscInt k=0; k<m; k++) {
1762c0aa6a63SJacob Faibussowitsch     ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr);
1763076ba34aSJunchao Zhang     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1764c0aa6a63SJacob Faibussowitsch       ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1765076ba34aSJunchao Zhang     }
1766076ba34aSJunchao Zhang     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1767076ba34aSJunchao Zhang   }
1768076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1769076ba34aSJunchao Zhang }
1770