xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 0ecb592a4e0be56a995dc4f68c381b34ca32ece0)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2076ba34aSJunchao Zhang #include <petscpkg_version.h>
3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
48c3ff71bSJunchao Zhang #include <petscsystypes.h>
58c3ff71bSJunchao Zhang #include <petscerror.h>
68c3ff71bSJunchao Zhang 
78c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
8f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
9076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp>
108c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
118c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
14076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
15076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
1686a27549SJunchao Zhang 
178c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h>
188c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
198c3ff71bSJunchao Zhang 
208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
218c3ff71bSJunchao Zhang 
22076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
25076ba34aSJunchao Zhang  */
268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
278c3ff71bSJunchao Zhang {
288c3ff71bSJunchao Zhang   PetscErrorCode    ierr;
29076ba34aSJunchao Zhang   Mat_SeqAIJ        *aijseq;
30076ba34aSJunchao Zhang   Mat_SeqAIJKokkos  *aijkok;
318c3ff71bSJunchao Zhang 
328c3ff71bSJunchao Zhang   PetscFunctionBegin;
33076ba34aSJunchao Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
348c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
35076ba34aSJunchao Zhang 
36076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38076ba34aSJunchao Zhang 
39076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
40076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
41076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
42076ba34aSJunchao Zhang   */
43076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
44076ba34aSJunchao Zhang     delete aijkok;
45076ba34aSJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
46076ba34aSJunchao Zhang     A->spptr = aijkok;
47076ba34aSJunchao Zhang   }
48076ba34aSJunchao Zhang 
49a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
50a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
51a587d139SMark   }
528c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
538c3ff71bSJunchao Zhang }
548c3ff71bSJunchao Zhang 
5586a27549SJunchao Zhang /* Sync CSR data to device if not yet */
56076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
578c3ff71bSJunchao Zhang {
588c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
598c3ff71bSJunchao Zhang 
608c3ff71bSJunchao Zhang   PetscFunctionBegin;
6186a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
62076ba34aSJunchao Zhang   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
63076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
64076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
65076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
66076ba34aSJunchao Zhang     aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */
6786a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
688c3ff71bSJunchao Zhang   }
698c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
708c3ff71bSJunchao Zhang }
718c3ff71bSJunchao Zhang 
72076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
73076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
7486a27549SJunchao Zhang {
7586a27549SJunchao Zhang   PetscErrorCode   ierr;
7686a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
7786a27549SJunchao Zhang 
7886a27549SJunchao Zhang   PetscFunctionBegin;
7986a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
8086a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
8186a27549SJunchao Zhang   aijkok->a_dual.modify_device();
8286a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
8386a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
8486a27549SJunchao Zhang   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
8586a27549SJunchao Zhang   PetscFunctionReturn(0);
8686a27549SJunchao Zhang }
8786a27549SJunchao Zhang 
88f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
89f0cf5187SStefano Zampini {
90f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
91f0cf5187SStefano Zampini 
92f0cf5187SStefano Zampini   PetscFunctionBegin;
93f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9486a27549SJunchao Zhang    /* We do not expect one needs factors on host  */
9586a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
96f0cf5187SStefano Zampini   if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
97076ba34aSJunchao Zhang   aijkok->a_dual.sync_host();
98f0cf5187SStefano Zampini   PetscFunctionReturn(0);
99f0cf5187SStefano Zampini }
100f0cf5187SStefano Zampini 
101f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
102f0cf5187SStefano Zampini {
103076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
104f0cf5187SStefano Zampini 
105f0cf5187SStefano Zampini   PetscFunctionBegin;
106076ba34aSJunchao Zhang   if (aijkok) {
107076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
108076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
109076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
110076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
111076ba34aSJunchao Zhang   }
112076ba34aSJunchao Zhang   PetscFunctionReturn(0);
113076ba34aSJunchao Zhang }
114076ba34aSJunchao Zhang 
115076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
116076ba34aSJunchao Zhang {
117076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
118076ba34aSJunchao Zhang 
119076ba34aSJunchao Zhang   PetscFunctionBegin;
120076ba34aSJunchao Zhang   if (aijkok) aijkok->a_dual.modify_host();
121076ba34aSJunchao Zhang   PetscFunctionReturn(0);
122076ba34aSJunchao Zhang }
123076ba34aSJunchao Zhang 
124076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
125076ba34aSJunchao Zhang {
126076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
127076ba34aSJunchao Zhang 
128076ba34aSJunchao Zhang   PetscFunctionBegin;
129076ba34aSJunchao Zhang   aijkok->a_dual.sync_host();
130076ba34aSJunchao Zhang   *array = aijkok->a_dual.view_host().data();
131076ba34aSJunchao Zhang   PetscFunctionReturn(0);
132076ba34aSJunchao Zhang }
133076ba34aSJunchao Zhang 
134076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
135076ba34aSJunchao Zhang {
136076ba34aSJunchao Zhang   PetscFunctionBegin;
137076ba34aSJunchao Zhang   *array = NULL;
138076ba34aSJunchao Zhang   PetscFunctionReturn(0);
139076ba34aSJunchao Zhang }
140076ba34aSJunchao Zhang 
141076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
142076ba34aSJunchao Zhang {
143076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
144076ba34aSJunchao Zhang 
145076ba34aSJunchao Zhang   PetscFunctionBegin;
146076ba34aSJunchao Zhang   *array = aijkok->a_dual.view_host().data();
147076ba34aSJunchao Zhang   PetscFunctionReturn(0);
148076ba34aSJunchao Zhang }
149076ba34aSJunchao Zhang 
150076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
151076ba34aSJunchao Zhang {
152076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
153076ba34aSJunchao Zhang 
154076ba34aSJunchao Zhang   PetscFunctionBegin;
155076ba34aSJunchao Zhang   aijkok->a_dual.clear_sync_state();
156076ba34aSJunchao Zhang   aijkok->a_dual.modify_host();
157f0cf5187SStefano Zampini   PetscFunctionReturn(0);
158f0cf5187SStefano Zampini }
159f0cf5187SStefano Zampini 
160a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
161042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
162a587d139SMark {
163042217e8SBarry Smith   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
164042217e8SBarry Smith   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
165a587d139SMark 
166a587d139SMark   PetscFunctionBegin;
167076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
168152b3e56SJunchao Zhang   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
169a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
170a587d139SMark   PetscFunctionReturn(0);
171a587d139SMark }
172a587d139SMark 
173a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
174042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
175a587d139SMark {
176042217e8SBarry Smith   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
177a587d139SMark 
178a587d139SMark   PetscFunctionBegin;
179a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
180a587d139SMark     *d_mat = aijkok->device_mat_d.data();
181a587d139SMark   } else {
182a587d139SMark     PetscErrorCode   ierr;
183a587d139SMark     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
184a587d139SMark     *d_mat  = NULL;
185a587d139SMark   }
186a587d139SMark   PetscFunctionReturn(0);
187a587d139SMark }
188076ba34aSJunchao Zhang 
189076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */
190076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
191152b3e56SJunchao Zhang {
192152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
193152b3e56SJunchao Zhang 
194152b3e56SJunchao Zhang   PetscFunctionBegin;
195076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
196076ba34aSJunchao Zhang   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
197076ba34aSJunchao Zhang     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
198076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
199076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
200076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
20186a27549SJunchao Zhang     aijkok->transpose_updated = PETSC_TRUE;
202076ba34aSJunchao Zhang   }
203076ba34aSJunchao Zhang   *csrmatT = &aijkok->csrmatT;
204152b3e56SJunchao Zhang   PetscFunctionReturn(0);
205152b3e56SJunchao Zhang }
206152b3e56SJunchao Zhang 
207076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */
208076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
209152b3e56SJunchao Zhang {
210152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
211152b3e56SJunchao Zhang 
212152b3e56SJunchao Zhang   PetscFunctionBegin;
213076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
214076ba34aSJunchao Zhang   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
215076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
216076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
217076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
218076ba34aSJunchao Zhang    #if defined(PETSC_USE_COMPLEX)
219076ba34aSJunchao Zhang     const auto& a = aijkok->csrmatH.values;
220076ba34aSJunchao Zhang     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
221076ba34aSJunchao Zhang    #endif
22286a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_TRUE;
223076ba34aSJunchao Zhang   }
224076ba34aSJunchao Zhang   *csrmatH = &aijkok->csrmatH;
225152b3e56SJunchao Zhang   PetscFunctionReturn(0);
226152b3e56SJunchao Zhang }
227a587d139SMark 
2288c3ff71bSJunchao Zhang /* y = A x */
2298c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2308c3ff71bSJunchao Zhang {
2318c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2328c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
233152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
234152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2358c3ff71bSJunchao Zhang 
2368c3ff71bSJunchao Zhang   PetscFunctionBegin;
2378c3ff71bSJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
238152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
239152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
2408c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
241152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
242152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
243152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
244bb2d6e60SJunchao Zhang   ierr   = WaitForKokkos();CHKERRQ(ierr);
245076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
246152b3e56SJunchao Zhang   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2478c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2488c3ff71bSJunchao Zhang }
2498c3ff71bSJunchao Zhang 
2508c3ff71bSJunchao Zhang /* y = A^T x */
2518c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2528c3ff71bSJunchao Zhang {
2538c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2548c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
255152b3e56SJunchao Zhang   const char                       *mode;
256152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
257152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
258076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
2598c3ff71bSJunchao Zhang 
2608c3ff71bSJunchao Zhang   PetscFunctionBegin;
2618c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
262152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
263152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
264152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
265076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
266152b3e56SJunchao Zhang     mode = "N";
267152b3e56SJunchao Zhang   } else {
268076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
269076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
270152b3e56SJunchao Zhang     mode = "T";
271152b3e56SJunchao Zhang   }
272076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
273152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
274152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
275bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
276076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
2778c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2788c3ff71bSJunchao Zhang }
2798c3ff71bSJunchao Zhang 
2808c3ff71bSJunchao Zhang /* y = A^H x */
2818c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2828c3ff71bSJunchao Zhang {
2838c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2848c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
285152b3e56SJunchao Zhang   const char                       *mode;
286152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
287152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
288076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
2898c3ff71bSJunchao Zhang 
2908c3ff71bSJunchao Zhang   PetscFunctionBegin;
2918c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
292152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
293152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
294152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
295076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
296152b3e56SJunchao Zhang     mode = "N";
297152b3e56SJunchao Zhang   } else {
298076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
299076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
300152b3e56SJunchao Zhang     mode = "C";
301152b3e56SJunchao Zhang   }
302076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
303152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
304152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
305bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
306076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
3078c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3088c3ff71bSJunchao Zhang }
3098c3ff71bSJunchao Zhang 
3108c3ff71bSJunchao Zhang /* z = A x + y */
3118c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
3128c3ff71bSJunchao Zhang {
3138c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3148c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
315152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
316152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
3178c3ff71bSJunchao Zhang 
3188c3ff71bSJunchao Zhang   PetscFunctionBegin;
3198c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
320152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
321152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
322152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3238c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
3248c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
325152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
326152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
327152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
328152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
329bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
330152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
3318c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3328c3ff71bSJunchao Zhang }
3338c3ff71bSJunchao Zhang 
3348c3ff71bSJunchao Zhang /* z = A^T x + y */
3358c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3368c3ff71bSJunchao Zhang {
3378c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3388c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
339152b3e56SJunchao Zhang   const char                       *mode;
340152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
341152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
342076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3438c3ff71bSJunchao Zhang 
3448c3ff71bSJunchao Zhang   PetscFunctionBegin;
3458c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
346152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
347152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
348152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3498c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
350152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
351076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
352152b3e56SJunchao Zhang     mode = "N";
353152b3e56SJunchao Zhang   } else {
354076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
355076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
356152b3e56SJunchao Zhang     mode = "T";
357152b3e56SJunchao Zhang   }
358076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
359152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
360152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
361152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
362bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
363076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
3648c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3658c3ff71bSJunchao Zhang }
3668c3ff71bSJunchao Zhang 
3678c3ff71bSJunchao Zhang /* z = A^H x + y */
3688c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3698c3ff71bSJunchao Zhang {
3708c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3718c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
372152b3e56SJunchao Zhang   const char                       *mode;
373152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
374152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
375076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3768c3ff71bSJunchao Zhang 
3778c3ff71bSJunchao Zhang   PetscFunctionBegin;
3788c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
379152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
380152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
381152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3828c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
383152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
384076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
385152b3e56SJunchao Zhang     mode = "N";
386152b3e56SJunchao Zhang   } else {
387076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
388076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
389152b3e56SJunchao Zhang     mode = "C";
390152b3e56SJunchao Zhang   }
391076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
392152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
393152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
394152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
395bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
396076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
397152b3e56SJunchao Zhang   PetscFunctionReturn(0);
398152b3e56SJunchao Zhang }
399152b3e56SJunchao Zhang 
400152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
401152b3e56SJunchao Zhang {
402152b3e56SJunchao Zhang   PetscErrorCode            ierr;
403152b3e56SJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
404152b3e56SJunchao Zhang 
405152b3e56SJunchao Zhang   PetscFunctionBegin;
406152b3e56SJunchao Zhang   switch (op) {
407152b3e56SJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
408152b3e56SJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
409152b3e56SJunchao Zhang       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
410152b3e56SJunchao Zhang       A->form_explicit_transpose = flg;
411152b3e56SJunchao Zhang       break;
412152b3e56SJunchao Zhang     default:
413152b3e56SJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
414152b3e56SJunchao Zhang       break;
415152b3e56SJunchao Zhang   }
4168c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4178c3ff71bSJunchao Zhang }
4188c3ff71bSJunchao Zhang 
419076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
4203d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
4218c3ff71bSJunchao Zhang {
4228c3ff71bSJunchao Zhang   PetscErrorCode   ierr;
423076ba34aSJunchao Zhang   Mat_SeqAIJ       *aseq;
4248c3ff71bSJunchao Zhang 
4258c3ff71bSJunchao Zhang   PetscFunctionBegin;
426a3f881fbSStefano Zampini   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
427076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
428076ba34aSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
4298c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
430076ba34aSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
431076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
432076ba34aSJunchao Zhang     if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
433076ba34aSJunchao Zhang     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
434076ba34aSJunchao Zhang     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
435076ba34aSJunchao Zhang     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
436076ba34aSJunchao Zhang     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
437076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ*>(A->data);
438076ba34aSJunchao Zhang     if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */
439076ba34aSJunchao Zhang       if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
440076ba34aSJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
4418c3ff71bSJunchao Zhang     }
442076ba34aSJunchao Zhang   }
4438c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4448c3ff71bSJunchao Zhang }
4458c3ff71bSJunchao Zhang 
446076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
447076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
448076ba34aSJunchao Zhang  */
449076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
4508c3ff71bSJunchao Zhang {
4518c3ff71bSJunchao Zhang   PetscErrorCode        ierr;
452076ba34aSJunchao Zhang   Mat_SeqAIJ            *bseq;
453076ba34aSJunchao Zhang   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
454076ba34aSJunchao Zhang   Mat                   mat;
4558c3ff71bSJunchao Zhang 
4568c3ff71bSJunchao Zhang   PetscFunctionBegin;
457076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
458076ba34aSJunchao Zhang   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
459076ba34aSJunchao Zhang   mat  = *B;
460076ba34aSJunchao Zhang   if (A->assembled) {
461076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
462076ba34aSJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
463076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
464076ba34aSJunchao Zhang     /* Now copy values to B if needed */
465076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
466076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
467076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
468076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
469076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
470076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
471076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
472076ba34aSJunchao Zhang       }
473076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
474076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
475076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
476076ba34aSJunchao Zhang     }
477076ba34aSJunchao Zhang     mat->spptr = bkok;
478076ba34aSJunchao Zhang   }
479076ba34aSJunchao Zhang 
480076ba34aSJunchao Zhang   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
481076ba34aSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
482076ba34aSJunchao Zhang   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
483076ba34aSJunchao Zhang   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
4848c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4858c3ff71bSJunchao Zhang }
4868c3ff71bSJunchao Zhang 
487*0ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
488*0ecb592aSJunchao Zhang {
489*0ecb592aSJunchao Zhang   PetscErrorCode    ierr;
490*0ecb592aSJunchao Zhang   Mat               At;
491*0ecb592aSJunchao Zhang   KokkosCsrMatrix   *internT,*csrmatT;
492*0ecb592aSJunchao Zhang   Mat_SeqAIJKokkos  *atkok,*bkok;
493*0ecb592aSJunchao Zhang 
494*0ecb592aSJunchao Zhang   PetscFunctionBegin;
495*0ecb592aSJunchao Zhang   ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */
496*0ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
497*0ecb592aSJunchao Zhang     CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */
498*0ecb592aSJunchao Zhang     CHKERRCXX(atkok   = new Mat_SeqAIJKokkos(*csrmatT));
499*0ecb592aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr);
500*0ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
501*0ecb592aSJunchao Zhang     else {ierr = MatHeaderMerge(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */
502*0ecb592aSJunchao Zhang   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
503*0ecb592aSJunchao Zhang     if ((*B)->assembled) {
504*0ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
505*0ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
506*0ecb592aSJunchao Zhang       ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr);
507*0ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
508*0ecb592aSJunchao Zhang       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
509*0ecb592aSJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
510*0ecb592aSJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
511*0ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
512*0ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
513*0ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
514*0ecb592aSJunchao Zhang   }
515*0ecb592aSJunchao Zhang   PetscFunctionReturn(0);
516*0ecb592aSJunchao Zhang }
517*0ecb592aSJunchao Zhang 
5188c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
5198c3ff71bSJunchao Zhang {
5208c3ff71bSJunchao Zhang   PetscErrorCode             ierr;
52186a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
5228c3ff71bSJunchao Zhang 
5238c3ff71bSJunchao Zhang   PetscFunctionBegin;
52486a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
52586a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
5268f7e8f9dSMark Adams     if (aijkok) {
5278f7e8f9dSMark Adams       if (aijkok->device_mat_d.data()) {
528a587d139SMark         delete aijkok->colmap_d;
529a587d139SMark         delete aijkok->i_uncompressed_d;
530a587d139SMark       }
5318f7e8f9dSMark Adams       if (aijkok->diag_d) delete aijkok->diag_d;
5328f7e8f9dSMark Adams     }
5338c3ff71bSJunchao Zhang     delete aijkok;
53486a27549SJunchao Zhang   } else {
53586a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
53686a27549SJunchao Zhang   }
537152b3e56SJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
538152b3e56SJunchao Zhang   A->spptr = NULL;
5398c3ff71bSJunchao Zhang   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
5408c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5418c3ff71bSJunchao Zhang }
5428c3ff71bSJunchao Zhang 
54386a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
54486a27549SJunchao Zhang {
54586a27549SJunchao Zhang   PetscErrorCode ierr;
54686a27549SJunchao Zhang 
54786a27549SJunchao Zhang   PetscFunctionBegin;
54886a27549SJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
54986a27549SJunchao Zhang   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
55086a27549SJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
55186a27549SJunchao Zhang   PetscFunctionReturn(0);
55286a27549SJunchao Zhang }
55386a27549SJunchao Zhang 
554076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
555076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
556a3f881fbSStefano Zampini {
557076ba34aSJunchao Zhang   PetscErrorCode               ierr;
558076ba34aSJunchao Zhang   Mat_SeqAIJ                   *a,*b;
559076ba34aSJunchao Zhang   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
560076ba34aSJunchao Zhang   MatScalarKokkosView          aa,ba,ca;
561076ba34aSJunchao Zhang   MatRowMapKokkosView          ai,bi,ci;
562076ba34aSJunchao Zhang   MatColIdxKokkosView          aj,bj,cj;
563076ba34aSJunchao Zhang   PetscInt                     m,n,nnz,aN;
564a3f881fbSStefano Zampini 
565a3f881fbSStefano Zampini   PetscFunctionBegin;
566076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
567076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
568076ba34aSJunchao Zhang   PetscValidPointer(C,4);
569076ba34aSJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
570076ba34aSJunchao Zhang   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
571076ba34aSJunchao Zhang   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
572076ba34aSJunchao Zhang   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
573076ba34aSJunchao Zhang 
574076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
575076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
576076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ*>(A->data);
577076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ*>(B->data);
578076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
579076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
580076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
581076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
582076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
583076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
584076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
585076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
586076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
587076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
588076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
589076ba34aSJunchao Zhang     aj = akok->j_dual.view_device();
590076ba34aSJunchao Zhang     bj = bkok->j_dual.view_device();
591076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
592076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
593076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
594076ba34aSJunchao Zhang     ca = ca_dual.view_device();
595076ba34aSJunchao Zhang     ci = ci_dual.view_device();
596076ba34aSJunchao Zhang     cj = cj_dual.view_device();
597076ba34aSJunchao Zhang 
598076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
599076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
600076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
601076ba34aSJunchao Zhang       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
602076ba34aSJunchao Zhang 
603076ba34aSJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
604076ba34aSJunchao Zhang         ci(i) = coffset;
605076ba34aSJunchao Zhang         if (i == m-1) ci(m) = ai(m) + bi(m);
606076ba34aSJunchao Zhang       });
607076ba34aSJunchao Zhang 
608076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
609076ba34aSJunchao Zhang         if (k < alen) {
610076ba34aSJunchao Zhang           ca(coffset+k) = aa(ai(i)+k);
611076ba34aSJunchao Zhang           cj(coffset+k) = aj(ai(i)+k);
612076ba34aSJunchao Zhang         } else {
613076ba34aSJunchao Zhang           ca(coffset+k) = ba(bi(i)+k-alen);
614076ba34aSJunchao Zhang           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
615076ba34aSJunchao Zhang         }
616076ba34aSJunchao Zhang       });
617076ba34aSJunchao Zhang     });
618076ba34aSJunchao Zhang     ca_dual.modify_device();
619076ba34aSJunchao Zhang     ci_dual.modify_device();
620076ba34aSJunchao Zhang     cj_dual.modify_device();
621076ba34aSJunchao Zhang     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
622076ba34aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
623076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
624076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
625076ba34aSJunchao Zhang     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
626076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
627076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
628076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
629076ba34aSJunchao Zhang 
630076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
631076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
632076ba34aSJunchao Zhang       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
633076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
634076ba34aSJunchao Zhang         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
635076ba34aSJunchao Zhang         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
636076ba34aSJunchao Zhang       });
637076ba34aSJunchao Zhang     });
638076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
639076ba34aSJunchao Zhang   }
640076ba34aSJunchao Zhang   PetscFunctionReturn(0);
641076ba34aSJunchao Zhang }
642076ba34aSJunchao Zhang 
643076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
644076ba34aSJunchao Zhang {
645076ba34aSJunchao Zhang   PetscFunctionBegin;
646076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
647a3f881fbSStefano Zampini   PetscFunctionReturn(0);
648a3f881fbSStefano Zampini }
649a3f881fbSStefano Zampini 
650a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
651a3f881fbSStefano Zampini {
652076ba34aSJunchao Zhang   PetscErrorCode                 ierr;
653a3f881fbSStefano Zampini   Mat_Product                    *product = C->product;
654a3f881fbSStefano Zampini   Mat                            A,B;
655076ba34aSJunchao Zhang   bool                           transA,transB; /* use bool, since KK needs this type */
656a3f881fbSStefano Zampini   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
657a3f881fbSStefano Zampini   Mat_SeqAIJ                     *c;
658076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
659076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB;
660a3f881fbSStefano Zampini 
661a3f881fbSStefano Zampini   PetscFunctionBegin;
662a3f881fbSStefano Zampini   MatCheckProduct(C,1);
663a3f881fbSStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
664076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
665076ba34aSJunchao Zhang 
666076ba34aSJunchao Zhang   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
667076ba34aSJunchao Zhang     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
668076ba34aSJunchao Zhang     PetscFunctionReturn(0);
669076ba34aSJunchao Zhang   }
670076ba34aSJunchao Zhang 
671076ba34aSJunchao Zhang   switch (product->type) {
672076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
673076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
674076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
675076ba34aSJunchao Zhang     default:
676076ba34aSJunchao Zhang       SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
677076ba34aSJunchao Zhang   }
678076ba34aSJunchao Zhang 
679a3f881fbSStefano Zampini   A     = product->A;
680a3f881fbSStefano Zampini   B     = product->B;
681a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
682a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
683a3f881fbSStefano Zampini   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
684a3f881fbSStefano Zampini   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
685a3f881fbSStefano Zampini   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
686076ba34aSJunchao Zhang 
687076ba34aSJunchao Zhang   if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
688076ba34aSJunchao Zhang 
689076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
690076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
691076ba34aSJunchao Zhang 
692076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
693076ba34aSJunchao Zhang   if (transA) {
694076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
695076ba34aSJunchao Zhang     transA = false;
696a3f881fbSStefano Zampini   }
697a3f881fbSStefano Zampini 
698076ba34aSJunchao Zhang   if (transB) {
699076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
700076ba34aSJunchao Zhang     transB = false;
701076ba34aSJunchao Zhang   }
702076ba34aSJunchao Zhang 
703076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
704076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
705076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
706a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
707a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
708a3f881fbSStefano Zampini   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
709a3f881fbSStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
710a3f881fbSStefano Zampini   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
711a3f881fbSStefano Zampini   c->reallocs         = 0;
712076ba34aSJunchao Zhang   C->info.mallocs     = 0;
713a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
714a3f881fbSStefano Zampini   C->assembled        = C->was_assembled = PETSC_TRUE;
715a3f881fbSStefano Zampini   C->num_ass++;
716a3f881fbSStefano Zampini   PetscFunctionReturn(0);
717a3f881fbSStefano Zampini }
718a3f881fbSStefano Zampini 
719a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
720a3f881fbSStefano Zampini {
721a3f881fbSStefano Zampini   PetscErrorCode                 ierr;
722076ba34aSJunchao Zhang   Mat_Product                    *product = C->product;
723076ba34aSJunchao Zhang   MatProductType                 ptype;
724076ba34aSJunchao Zhang   Mat                            A,B;
725076ba34aSJunchao Zhang   bool                           transA,transB;
726076ba34aSJunchao Zhang   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
727076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
728076ba34aSJunchao Zhang   MPI_Comm                       comm;
729076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
730a3f881fbSStefano Zampini 
731a3f881fbSStefano Zampini   PetscFunctionBegin;
732a3f881fbSStefano Zampini   MatCheckProduct(C,1);
733076ba34aSJunchao Zhang   ierr = PetscObjectGetComm((PetscObject)C,&comm);
734076ba34aSJunchao Zhang   if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty");
735a3f881fbSStefano Zampini   A       = product->A;
736a3f881fbSStefano Zampini   B       = product->B;
737a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
738a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
739a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
740a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
741076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
742076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
743076ba34aSJunchao Zhang 
744a3f881fbSStefano Zampini   ptype   = product->type;
745a3f881fbSStefano Zampini   switch (ptype) {
746076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
747076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
748076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
749a3f881fbSStefano Zampini     default:
750076ba34aSJunchao Zhang       SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
751a3f881fbSStefano Zampini   }
752a3f881fbSStefano Zampini 
753076ba34aSJunchao Zhang   product->data = pdata = new MatProductData_SeqAIJKokkos();
754076ba34aSJunchao Zhang   pdata->kh.set_team_work_size(16);
755076ba34aSJunchao Zhang   pdata->kh.set_dynamic_scheduling(true);
756076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
757a3f881fbSStefano Zampini 
758076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
759076ba34aSJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
760076ba34aSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
761076ba34aSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
762076ba34aSJunchao Zhang     /* This algorithm + cuda-10.2 sometimes gave wrong results (invalid device pointers in csrmatC) and failed snes/tutorials/ex56.c */
763076ba34aSJunchao Zhang     spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_CUSPARSE;
764076ba34aSJunchao Zhang   #endif
765076ba34aSJunchao Zhang #endif
766076ba34aSJunchao Zhang   pdata->kh.create_spgemm_handle(spgemm_alg);
767076ba34aSJunchao Zhang 
768076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
769076ba34aSJunchao Zhang   if (transA) {
770076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
771076ba34aSJunchao Zhang     transA = false;
772076ba34aSJunchao Zhang   }
773076ba34aSJunchao Zhang 
774076ba34aSJunchao Zhang   if (transB) {
775076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
776076ba34aSJunchao Zhang     transB = false;
777076ba34aSJunchao Zhang   }
778076ba34aSJunchao Zhang 
779076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
780076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
781076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
782076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
783076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
784076ba34aSJunchao Zhang   */
785076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
786076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
787076ba34aSJunchao Zhang 
788076ba34aSJunchao Zhang   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
789076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
790076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
791a3f881fbSStefano Zampini   PetscFunctionReturn(0);
792a3f881fbSStefano Zampini }
793a3f881fbSStefano Zampini 
794a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
795076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
796a3f881fbSStefano Zampini {
797a3f881fbSStefano Zampini   PetscErrorCode ierr;
798076ba34aSJunchao Zhang   Mat_Product    *product = mat->product;
799a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
800a3f881fbSStefano Zampini 
801a3f881fbSStefano Zampini   PetscFunctionBegin;
802a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
803a3f881fbSStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
804a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
805a3f881fbSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
806a3f881fbSStefano Zampini   }
807a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
808a3f881fbSStefano Zampini     switch (product->type) {
809a3f881fbSStefano Zampini     case MATPRODUCT_AB:
810a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
811a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
812a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
813a3f881fbSStefano Zampini       break;
814a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
815a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
816a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
817a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
818a3f881fbSStefano Zampini       break;
819a3f881fbSStefano Zampini     default:
820a3f881fbSStefano Zampini       break;
821a3f881fbSStefano Zampini     }
822a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
823a3f881fbSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
824a3f881fbSStefano Zampini   }
825a3f881fbSStefano Zampini   PetscFunctionReturn(0);
826a3f881fbSStefano Zampini }
827a587d139SMark 
828f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
829f0cf5187SStefano Zampini {
830f0cf5187SStefano Zampini   PetscErrorCode   ierr;
831f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
832f0cf5187SStefano Zampini 
833f0cf5187SStefano Zampini   PetscFunctionBegin;
834f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
835f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
836076ba34aSJunchao Zhang   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
837076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
838f0cf5187SStefano Zampini   ierr = WaitForKokkos();CHKERRQ(ierr);
839076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
840f0cf5187SStefano Zampini   PetscFunctionReturn(0);
841f0cf5187SStefano Zampini }
842f0cf5187SStefano Zampini 
843a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
844a587d139SMark {
845a587d139SMark   PetscErrorCode   ierr;
846076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
847a587d139SMark 
848a587d139SMark   PetscFunctionBegin;
849076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
850076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
851076ba34aSJunchao Zhang   KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
852076ba34aSJunchao Zhang   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); /* Cached diagonal values are invalided */
853076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
854a587d139SMark   PetscFunctionReturn(0);
855a587d139SMark }
856a587d139SMark 
857db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
858db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
859db78de30SJunchao Zhang {
860db78de30SJunchao Zhang   PetscErrorCode     ierr;
861db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
862db78de30SJunchao Zhang 
863db78de30SJunchao Zhang   PetscFunctionBegin;
864db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
865db78de30SJunchao Zhang   PetscValidPointer(kv,2);
866db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
867db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
868db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
869076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
870db78de30SJunchao Zhang   PetscFunctionReturn(0);
871db78de30SJunchao Zhang }
872db78de30SJunchao Zhang 
873db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
874db78de30SJunchao Zhang {
875db78de30SJunchao Zhang   PetscFunctionBegin;
876db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
877db78de30SJunchao Zhang   PetscValidPointer(kv,2);
878db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
879db78de30SJunchao Zhang   PetscFunctionReturn(0);
880db78de30SJunchao Zhang }
881db78de30SJunchao Zhang 
882db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv)
883db78de30SJunchao Zhang {
884db78de30SJunchao Zhang   PetscErrorCode     ierr;
885db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
886db78de30SJunchao Zhang 
887db78de30SJunchao Zhang   PetscFunctionBegin;
888db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
889db78de30SJunchao Zhang   PetscValidPointer(kv,2);
890db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
891db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
892db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
893076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
894db78de30SJunchao Zhang   PetscFunctionReturn(0);
895db78de30SJunchao Zhang }
896db78de30SJunchao Zhang 
897db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv)
898db78de30SJunchao Zhang {
899db78de30SJunchao Zhang   PetscErrorCode     ierr;
900db78de30SJunchao Zhang 
901db78de30SJunchao Zhang   PetscFunctionBegin;
902db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
903db78de30SJunchao Zhang   PetscValidPointer(kv,2);
904db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
905076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
906db78de30SJunchao Zhang   PetscFunctionReturn(0);
907db78de30SJunchao Zhang }
908db78de30SJunchao Zhang 
909db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
910db78de30SJunchao Zhang {
911db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
912db78de30SJunchao Zhang 
913db78de30SJunchao Zhang   PetscFunctionBegin;
914db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
915db78de30SJunchao Zhang   PetscValidPointer(kv,2);
916db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
917db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
918076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
919db78de30SJunchao Zhang   PetscFunctionReturn(0);
920db78de30SJunchao Zhang }
921db78de30SJunchao Zhang 
922db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
923db78de30SJunchao Zhang {
924db78de30SJunchao Zhang   PetscErrorCode     ierr;
925db78de30SJunchao Zhang 
926db78de30SJunchao Zhang   PetscFunctionBegin;
927db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
928db78de30SJunchao Zhang   PetscValidPointer(kv,2);
929db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
930076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
931db78de30SJunchao Zhang   PetscFunctionReturn(0);
932db78de30SJunchao Zhang }
933db78de30SJunchao Zhang 
934c17cf699SJunchao Zhang /* Computes Y += alpha X */
935c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
936a587d139SMark {
937a587d139SMark   PetscErrorCode             ierr;
938a587d139SMark   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
939c17cf699SJunchao Zhang   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
940c17cf699SJunchao Zhang   ConstMatScalarKokkosView   Xa;
941c17cf699SJunchao Zhang   MatScalarKokkosView        Ya;
942a587d139SMark 
943a587d139SMark   PetscFunctionBegin;
944c17cf699SJunchao Zhang   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
945c17cf699SJunchao Zhang   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
946c17cf699SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
947c17cf699SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
948db78de30SJunchao Zhang 
949c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
950c17cf699SJunchao Zhang     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
951a587d139SMark     PetscBool e;
952a587d139SMark     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
953a587d139SMark     if (e) {
954a587d139SMark       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
955c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
956a587d139SMark     }
957a587d139SMark   }
958db78de30SJunchao Zhang 
959c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
960c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
961c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
962c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
963c17cf699SJunchao Zhang   */
964c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
965c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
966c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
967c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
968c17cf699SJunchao Zhang 
969c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
970c17cf699SJunchao Zhang     KokkosBlas::axpy(alpha,Xa,Ya);
971c17cf699SJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(Y);
972c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
973c17cf699SJunchao Zhang     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
974c17cf699SJunchao Zhang     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
975c17cf699SJunchao Zhang 
976c17cf699SJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
977c17cf699SJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
978c17cf699SJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
979c17cf699SJunchao Zhang         PetscInt p,q = Yi(i);
980c17cf699SJunchao Zhang         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
981c17cf699SJunchao Zhang           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
982c17cf699SJunchao Zhang           if (Xj(p) == Yj(q)) { /* Found it */
983c17cf699SJunchao Zhang             Ya(q) += alpha * Xa(p);
984c17cf699SJunchao Zhang             q++;
985a587d139SMark           } else {
986c17cf699SJunchao Zhang             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
987c17cf699SJunchao Zhang                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
988c17cf699SJunchao Zhang             */
989c17cf699SJunchao Zhang             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
990a587d139SMark           }
991c17cf699SJunchao Zhang         }
992c17cf699SJunchao Zhang       });
993c17cf699SJunchao Zhang     });
994c17cf699SJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(Y);
995c17cf699SJunchao Zhang   } else { /* different nonzero patterns */
996c17cf699SJunchao Zhang     Mat             Z;
997c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
998c17cf699SJunchao Zhang     KernelHandle    kh;
999c17cf699SJunchao Zhang     kh.create_spadd_handle(false);
1000c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1001c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1002c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
1003c17cf699SJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
1004c17cf699SJunchao Zhang     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
1005c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1006c17cf699SJunchao Zhang   }
1007c17cf699SJunchao Zhang 
1008a587d139SMark   PetscFunctionReturn(0);
1009a587d139SMark }
1010a587d139SMark 
101186a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
10128f7e8f9dSMark Adams {
10138f7e8f9dSMark Adams   PetscErrorCode ierr;
10148f7e8f9dSMark Adams 
10158f7e8f9dSMark Adams   PetscFunctionBegin;
10168f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
10178f7e8f9dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
10188f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
10198f7e8f9dSMark Adams   PetscFunctionReturn(0);
10208f7e8f9dSMark Adams }
10218f7e8f9dSMark Adams 
10228c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
10238c3ff71bSJunchao Zhang {
1024076ba34aSJunchao Zhang   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1025076ba34aSJunchao Zhang 
10268c3ff71bSJunchao Zhang   PetscFunctionBegin;
1027076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
10286f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
10296f3d89d0SStefano Zampini 
10308c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
10318c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
10328c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1033a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1034f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1035a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1036076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
10378c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
10388c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
10398c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
10408c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
10418c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
10428c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1043076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1044*0ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1045152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1046076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1047076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1048076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1049076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1050076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1051076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1052076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1053076ba34aSJunchao Zhang }
1054076ba34aSJunchao Zhang 
1055076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1056076ba34aSJunchao Zhang {
1057076ba34aSJunchao Zhang   PetscErrorCode     ierr;
1058076ba34aSJunchao Zhang   Mat_SeqAIJ         *aseq;
1059076ba34aSJunchao Zhang   PetscInt           i,m,n;
1060076ba34aSJunchao Zhang 
1061076ba34aSJunchao Zhang   PetscFunctionBegin;
1062076ba34aSJunchao Zhang   if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1063076ba34aSJunchao Zhang 
1064076ba34aSJunchao Zhang   m    = akok->nrows();
1065076ba34aSJunchao Zhang   n    = akok->ncols();
1066076ba34aSJunchao Zhang   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1067076ba34aSJunchao Zhang   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1068076ba34aSJunchao Zhang 
1069076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
1070076ba34aSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1071076ba34aSJunchao Zhang   aseq = (Mat_SeqAIJ*)(A)->data;
1072076ba34aSJunchao Zhang 
1073076ba34aSJunchao Zhang   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1074076ba34aSJunchao Zhang   akok->j_dual.sync_host();
1075076ba34aSJunchao Zhang 
1076076ba34aSJunchao Zhang   aseq->i            = akok->i_host_data();
1077076ba34aSJunchao Zhang   aseq->j            = akok->j_host_data();
1078076ba34aSJunchao Zhang   aseq->a            = akok->a_host_data();
1079076ba34aSJunchao Zhang   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1080076ba34aSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE;
1081076ba34aSJunchao Zhang   aseq->free_a       = PETSC_FALSE;
1082076ba34aSJunchao Zhang   aseq->free_ij      = PETSC_FALSE;
1083076ba34aSJunchao Zhang   aseq->nz           = akok->nnz();
1084076ba34aSJunchao Zhang   aseq->maxnz        = aseq->nz;
1085076ba34aSJunchao Zhang 
1086076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1087076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1088076ba34aSJunchao Zhang   for (i=0; i<m; i++) {
1089076ba34aSJunchao Zhang     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1090076ba34aSJunchao Zhang   }
1091076ba34aSJunchao Zhang 
1092076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1093076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1094076ba34aSJunchao Zhang   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1095076ba34aSJunchao Zhang   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1096076ba34aSJunchao Zhang   A->spptr = akok;
1097076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1098076ba34aSJunchao Zhang }
1099076ba34aSJunchao Zhang 
1100076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1101076ba34aSJunchao Zhang 
1102076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1103076ba34aSJunchao Zhang  */
1104076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1105076ba34aSJunchao Zhang {
1106076ba34aSJunchao Zhang   PetscErrorCode     ierr;
1107076ba34aSJunchao Zhang 
1108076ba34aSJunchao Zhang   PetscFunctionBegin;
1109076ba34aSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1110076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
11118c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
11128c3ff71bSJunchao Zhang }
11138c3ff71bSJunchao Zhang 
11148c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
1115152b3e56SJunchao Zhang /*@C
11168c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
11178c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
11188c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
11198c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
11208c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
11218c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
11228c3ff71bSJunchao Zhang 
11238c3ff71bSJunchao Zhang    Collective
11248c3ff71bSJunchao Zhang 
11258c3ff71bSJunchao Zhang    Input Parameters:
11268c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
11278c3ff71bSJunchao Zhang .  m - number of rows
11288c3ff71bSJunchao Zhang .  n - number of columns
11298c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
11308c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
11318c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
11328c3ff71bSJunchao Zhang 
11338c3ff71bSJunchao Zhang    Output Parameter:
11348c3ff71bSJunchao Zhang .  A - the matrix
11358c3ff71bSJunchao Zhang 
11368c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
11378c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
11388c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
11398c3ff71bSJunchao Zhang 
11408c3ff71bSJunchao Zhang    Notes:
11418c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
11428c3ff71bSJunchao Zhang 
11438c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
11448c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
11458c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
11468c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
11478c3ff71bSJunchao Zhang 
11488c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
11498c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
11508c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
11518c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
11528c3ff71bSJunchao Zhang 
11538c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
11548c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
11558c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
11568c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
11578c3ff71bSJunchao Zhang 
11588c3ff71bSJunchao Zhang    Level: intermediate
11598c3ff71bSJunchao Zhang 
1160076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
11618c3ff71bSJunchao Zhang @*/
11628c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
11638c3ff71bSJunchao Zhang {
11648c3ff71bSJunchao Zhang   PetscErrorCode ierr;
11658c3ff71bSJunchao Zhang 
11668c3ff71bSJunchao Zhang   PetscFunctionBegin;
11678c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
11688c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
11698c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
11708c3ff71bSJunchao Zhang   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
11718c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
11728c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
11738c3ff71bSJunchao Zhang }
1174930e68a5SMark Adams 
11758f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
11768f7e8f9dSMark Adams //
11778f7e8f9dSMark Adams // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
11788f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
11798f7e8f9dSMark Adams //
11808f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1181930e68a5SMark Adams {
11828f7e8f9dSMark Adams   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
11838f7e8f9dSMark Adams   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
11848f7e8f9dSMark Adams   IS                 isrow = b->row,isicol = b->icol;
1185930e68a5SMark Adams   PetscErrorCode     ierr;
11868f7e8f9dSMark Adams   const PetscInt     *r_h,*ic_h;
1187076ba34aSJunchao Zhang   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data();
1188076ba34aSJunchao Zhang   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1189076ba34aSJunchao Zhang   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
11908f7e8f9dSMark Adams   PetscBool          row_identity,col_identity;
11918f7e8f9dSMark Adams   PetscInt           nc, Nf, nVec=32; // should be a parameter
11928f7e8f9dSMark Adams   PetscContainer     container;
1193930e68a5SMark Adams 
1194930e68a5SMark Adams   PetscFunctionBegin;
11958f7e8f9dSMark Adams   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n);
11968f7e8f9dSMark Adams   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
11978f7e8f9dSMark Adams   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
11988f7e8f9dSMark Adams   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
11998f7e8f9dSMark Adams   if (container) {
12008f7e8f9dSMark Adams     PetscInt *pNf=NULL, nv;
12018f7e8f9dSMark Adams     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
12028f7e8f9dSMark Adams     Nf = (*pNf)%1000;
12038f7e8f9dSMark Adams     nv = (*pNf)/1000;
12048f7e8f9dSMark Adams     if (nv>0) nVec = nv;
12058f7e8f9dSMark Adams   } else Nf = 1;
12068f7e8f9dSMark Adams   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
12078f7e8f9dSMark Adams   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
12088f7e8f9dSMark Adams   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
12098f7e8f9dSMark Adams   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
12108f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
12118f7e8f9dSMark Adams   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
12128f7e8f9dSMark Adams #endif
12138f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
12148f7e8f9dSMark Adams   {
12158f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
12168f7e8f9dSMark Adams     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
12178f7e8f9dSMark Adams     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
12188f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
12198f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
12208f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
12218f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
12228f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
12238f7e8f9dSMark Adams     size_t flops_h = 0.0;
12248f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
12258f7e8f9dSMark Adams     Kokkos::View<size_t> d_flops_k ("flops");
12268f7e8f9dSMark Adams     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
12278f7e8f9dSMark Adams     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
12288f7e8f9dSMark Adams     Kokkos::deep_copy (d_flops_k, h_flops_k);
12298f7e8f9dSMark Adams     Kokkos::deep_copy (d_r_k, h_r_k);
12308f7e8f9dSMark Adams     Kokkos::deep_copy (d_ic_k, h_ic_k);
12318f7e8f9dSMark Adams     // Fill A --> fact
12328f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1233042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
12348f7e8f9dSMark Adams         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
12358f7e8f9dSMark Adams         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
12368f7e8f9dSMark Adams         // zero rows of B
12378f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
12388f7e8f9dSMark Adams             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
12398f7e8f9dSMark Adams             PetscScalar *baL = ba_d + bi_d[rowb];
12408f7e8f9dSMark Adams             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
12418f7e8f9dSMark Adams             /* zero (unfactored row) */
12428f7e8f9dSMark Adams             for (int j=0;j<nzbL;j++) baL[j] = 0;
12438f7e8f9dSMark Adams             for (int j=0;j<nzbU;j++) baU[j] = 0;
12448f7e8f9dSMark Adams           });
12458f7e8f9dSMark Adams         // copy A into B
12468f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
12478f7e8f9dSMark Adams             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
12488f7e8f9dSMark Adams             const PetscScalar *av    = aa_d + ai_d[rowa];
12498f7e8f9dSMark Adams             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
12508f7e8f9dSMark Adams             /* load in initial (unfactored row) */
12518f7e8f9dSMark Adams             for (int j=0;j<nza;j++) {
12528f7e8f9dSMark Adams               PetscInt    colb = ic[ajtmp[j]];
12538f7e8f9dSMark Adams               PetscScalar vala = av[j];
12548f7e8f9dSMark Adams               if (colb == rowb) {
12558f7e8f9dSMark Adams                 *(ba_d + bdiag_d[rowb]) = vala;
12568f7e8f9dSMark Adams               } else {
12578f7e8f9dSMark Adams                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
12588f7e8f9dSMark Adams                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
12598f7e8f9dSMark Adams                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
12608f7e8f9dSMark Adams                 for (int j=0; j<nz ; j++) {
12618f7e8f9dSMark Adams                   if (pbj[j] == colb) {
12628f7e8f9dSMark Adams                     pba[j] = vala;
12638f7e8f9dSMark Adams                     set++;
12648f7e8f9dSMark Adams                     break;
12658f7e8f9dSMark Adams                   }
12668f7e8f9dSMark Adams                 }
12678f7e8f9dSMark Adams                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
12688f7e8f9dSMark Adams               }
12698f7e8f9dSMark Adams             }
12708f7e8f9dSMark Adams           });
12718f7e8f9dSMark Adams       });
12728f7e8f9dSMark Adams     Kokkos::fence();
1273930e68a5SMark Adams 
12748f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
12758f7e8f9dSMark Adams         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
12768f7e8f9dSMark Adams         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
12778f7e8f9dSMark Adams         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1278042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
12798f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc;
12808f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
12818f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
12828f7e8f9dSMark Adams         for (int ii=start; ii<end-1; ii++) {
12838f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
12848f7e8f9dSMark Adams           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
12858f7e8f9dSMark Adams           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
12868f7e8f9dSMark Adams           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
12878f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
12888f7e8f9dSMark Adams               PetscInt kIdx = j*Ni + field_block_idx;
12898f7e8f9dSMark Adams               if (kIdx >= nzUi) /* void */ ;
12908f7e8f9dSMark Adams               else {
12918f7e8f9dSMark Adams                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
12928f7e8f9dSMark Adams                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
12938f7e8f9dSMark Adams                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
12948f7e8f9dSMark Adams                 size_t         st_idx;
12958f7e8f9dSMark Adams                 // find and do L(k,i) = A(:k,i) / A(i,i)
12968f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
12978f7e8f9dSMark Adams                 // get column, there has got to be a better way
12988f7e8f9dSMark Adams                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
12998f7e8f9dSMark Adams                     //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii);
13008f7e8f9dSMark Adams                     if (pjL[j] == ii) {
13018f7e8f9dSMark Adams                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
13028f7e8f9dSMark Adams                       idx = j; // output
13038f7e8f9dSMark Adams                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
13048f7e8f9dSMark Adams                     }
13058f7e8f9dSMark Adams                   }, st_idx);
13068f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
13078f7e8f9dSMark Adams                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii);
13088f7e8f9dSMark Adams                 else { // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
13098f7e8f9dSMark Adams                   // U(i+1,:end)
13108f7e8f9dSMark Adams                   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
13118f7e8f9dSMark Adams                       PetscScalar Uij = baUi[uiIdx];
13128f7e8f9dSMark Adams                       PetscInt    col = bjUi[uiIdx];
13138f7e8f9dSMark Adams                       if (col==myk) {
13148f7e8f9dSMark Adams                         // A_kk = A_kk - L_ki * U_ij(k)
13158f7e8f9dSMark Adams                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
13168f7e8f9dSMark Adams                         *Akkv = *Akkv - L_ki() * Uij; // UiK
13178f7e8f9dSMark Adams                       } else {
13188f7e8f9dSMark Adams                         PetscScalar    *start, *end, *pAkjv=NULL;
13198f7e8f9dSMark Adams                         PetscInt       high, low;
13208f7e8f9dSMark Adams                         const PetscInt *startj;
13218f7e8f9dSMark Adams                         if (col<myk) { // L
13228f7e8f9dSMark Adams                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
13238f7e8f9dSMark Adams                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
13248f7e8f9dSMark Adams                           start = pLki+1; // start at pLki+1, A22(myk,1)
13258f7e8f9dSMark Adams                           startj= bj_d + bi_d[myk] + idx;
13268f7e8f9dSMark Adams                           end   = ba_d + bi_d[myk+1];
13278f7e8f9dSMark Adams                         } else {
13288f7e8f9dSMark Adams                           PetscInt idx = bdiag_d[myk+1]+1;
13298f7e8f9dSMark Adams                           start = ba_d + idx;
13308f7e8f9dSMark Adams                           startj= bj_d + idx;
13318f7e8f9dSMark Adams                           end   = ba_d + bdiag_d[myk];
13328f7e8f9dSMark Adams                         }
13338f7e8f9dSMark Adams                         // search for 'col', use bisection search - TODO
13348f7e8f9dSMark Adams                         low  = 0;
13358f7e8f9dSMark Adams                         high = (PetscInt)(end-start);
13368f7e8f9dSMark Adams                         while (high-low > 5) {
13378f7e8f9dSMark Adams                           int t = (low+high)/2;
13388f7e8f9dSMark Adams                           if (startj[t] > col) high = t;
13398f7e8f9dSMark Adams                           else                 low  = t;
13408f7e8f9dSMark Adams                         }
13418f7e8f9dSMark Adams                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
13428f7e8f9dSMark Adams                           if (startj[pAkjv-start] == col) break;
13438f7e8f9dSMark Adams                         }
13448f7e8f9dSMark Adams                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col);
13458f7e8f9dSMark Adams                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
13468f7e8f9dSMark Adams                       }
13478f7e8f9dSMark Adams                     });
13488f7e8f9dSMark Adams                 }
13498f7e8f9dSMark Adams               }
13508f7e8f9dSMark Adams             });
13518f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
13528f7e8f9dSMark Adams           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
13538f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
13548f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
13558f7e8f9dSMark Adams       });
13568f7e8f9dSMark Adams     Kokkos::fence();
13578f7e8f9dSMark Adams     Kokkos::deep_copy (h_flops_k, d_flops_k);
13588f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
13598f7e8f9dSMark Adams     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
13608f7e8f9dSMark Adams #elif  defined(PETSC_USE_LOG)
13618f7e8f9dSMark Adams     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
13628f7e8f9dSMark Adams #endif
13638f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
13648f7e8f9dSMark Adams         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
13658f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
13668f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
13678f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
13688f7e8f9dSMark Adams             int i = start + outer_index*Ni + lg_rank%Ni;
13698f7e8f9dSMark Adams             if (i < end) {
13708f7e8f9dSMark Adams               PetscScalar *pv = ba_d + bdiag_d[i];
13718f7e8f9dSMark Adams               *pv = 1.0/(*pv);
13728f7e8f9dSMark Adams             }
13738f7e8f9dSMark Adams           });
13748f7e8f9dSMark Adams       });
13758f7e8f9dSMark Adams   }
13768f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
13778f7e8f9dSMark Adams   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
13788f7e8f9dSMark Adams #endif
13798f7e8f9dSMark Adams   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
13808f7e8f9dSMark Adams   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
13818f7e8f9dSMark Adams 
13828f7e8f9dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
13838f7e8f9dSMark Adams   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
13848f7e8f9dSMark Adams   if (b->inode.size) {
13858f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
13868f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
13878f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
13888f7e8f9dSMark Adams   } else {
13898f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
13908f7e8f9dSMark Adams   }
13918f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
13928f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
13938f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
13948f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
13958f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
13968f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
13978f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
13988f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
13998f7e8f9dSMark Adams 
1400930e68a5SMark Adams   PetscFunctionReturn(0);
1401930e68a5SMark Adams }
1402930e68a5SMark Adams 
140386a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1404930e68a5SMark Adams {
1405930e68a5SMark Adams   PetscErrorCode   ierr;
1406930e68a5SMark Adams 
1407930e68a5SMark Adams   PetscFunctionBegin;
1408930e68a5SMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
140986a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
141086a27549SJunchao Zhang   PetscFunctionReturn(0);
141186a27549SJunchao Zhang }
141286a27549SJunchao Zhang 
141386a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
141486a27549SJunchao Zhang {
141586a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
141686a27549SJunchao Zhang 
141786a27549SJunchao Zhang   PetscFunctionBegin;
141886a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
141986a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
142086a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
142186a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
142286a27549SJunchao Zhang   }
142386a27549SJunchao Zhang   PetscFunctionReturn(0);
142486a27549SJunchao Zhang }
142586a27549SJunchao Zhang 
142686a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
142786a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
142886a27549SJunchao Zhang {
142986a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1430076ba34aSJunchao Zhang   MatColIdxType             n = A->rmap->n;
143186a27549SJunchao Zhang 
143286a27549SJunchao Zhang   PetscFunctionBegin;
143386a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
143486a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
1435076ba34aSJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
143686a27549SJunchao Zhang     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1437076ba34aSJunchao Zhang     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1438076ba34aSJunchao Zhang     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
143986a27549SJunchao Zhang 
144086a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1441076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1442076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1443076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
144486a27549SJunchao Zhang         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
144586a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
144686a27549SJunchao Zhang 
144786a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
144886a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
144986a27549SJunchao Zhang     */
1450076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1451076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
145286a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
145386a27549SJunchao Zhang 
145486a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
145586a27549SJunchao Zhang 
145686a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
1457076ba34aSJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
145886a27549SJunchao Zhang     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1459076ba34aSJunchao Zhang     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1460076ba34aSJunchao Zhang     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
146186a27549SJunchao Zhang 
146286a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1463076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1464076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1465076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
146686a27549SJunchao Zhang         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
146786a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
146886a27549SJunchao Zhang 
146986a27549SJunchao Zhang     /* Sort indices. See comments above */
1470076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1471076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
147286a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
147386a27549SJunchao Zhang 
147486a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
147586a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
147686a27549SJunchao Zhang   }
147786a27549SJunchao Zhang   PetscFunctionReturn(0);
147886a27549SJunchao Zhang }
147986a27549SJunchao Zhang 
148086a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
148186a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
148286a27549SJunchao Zhang {
148386a27549SJunchao Zhang   PetscErrorCode                 ierr;
148486a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
148586a27549SJunchao Zhang   PetscScalarKokkosView          xv;
148686a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
148786a27549SJunchao Zhang 
148886a27549SJunchao Zhang   PetscFunctionBegin;
148986a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
149086a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
149186a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
149286a27549SJunchao Zhang   /* Solve L tmpv = b */
14933f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
149486a27549SJunchao Zhang   /* Solve Ux = tmpv */
14953f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
149686a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
149786a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
149886a27549SJunchao Zhang   PetscFunctionReturn(0);
149986a27549SJunchao Zhang }
150086a27549SJunchao Zhang 
1501076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
150286a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
150386a27549SJunchao Zhang {
150486a27549SJunchao Zhang   PetscErrorCode                 ierr;
150586a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
150686a27549SJunchao Zhang   PetscScalarKokkosView          xv;
150786a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
150886a27549SJunchao Zhang 
150986a27549SJunchao Zhang   PetscFunctionBegin;
151086a27549SJunchao Zhang   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
151186a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
151286a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
151386a27549SJunchao Zhang   /* Solve U^T tmpv = b */
151486a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
151586a27549SJunchao Zhang 
151686a27549SJunchao Zhang   /* Solve L^T x = tmpv */
151786a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
151886a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
151986a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
152086a27549SJunchao Zhang   PetscFunctionReturn(0);
152186a27549SJunchao Zhang }
152286a27549SJunchao Zhang 
152386a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
152486a27549SJunchao Zhang {
152586a27549SJunchao Zhang   PetscErrorCode                 ierr;
152686a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
152786a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
152886a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
152986a27549SJunchao Zhang 
153086a27549SJunchao Zhang   PetscFunctionBegin;
153186a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1532076ba34aSJunchao Zhang 
1533076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1534076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1535076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1536076ba34aSJunchao Zhang 
1537076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
153886a27549SJunchao Zhang 
153986a27549SJunchao Zhang   B->assembled                       = PETSC_TRUE;
154086a27549SJunchao Zhang   B->preallocated                    = PETSC_TRUE;
154186a27549SJunchao Zhang   B->ops->solve                      = MatSolve_SeqAIJKokkos;
154286a27549SJunchao Zhang   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
154386a27549SJunchao Zhang   B->ops->matsolve                   = NULL;
154486a27549SJunchao Zhang   B->ops->matsolvetranspose          = NULL;
154586a27549SJunchao Zhang   B->offloadmask                     = PETSC_OFFLOAD_GPU;
154686a27549SJunchao Zhang 
154786a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
154886a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
154986a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
155086a27549SJunchao Zhang   PetscFunctionReturn(0);
155186a27549SJunchao Zhang }
155286a27549SJunchao Zhang 
155386a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
155486a27549SJunchao Zhang {
155586a27549SJunchao Zhang   PetscErrorCode                 ierr;
155686a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok;
155786a27549SJunchao Zhang   Mat_SeqAIJ                     *b;
155886a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
155986a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
156086a27549SJunchao Zhang   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
156186a27549SJunchao Zhang   PetscInt                       n = A->rmap->n;
156286a27549SJunchao Zhang 
156386a27549SJunchao Zhang   PetscFunctionBegin;
156486a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
156586a27549SJunchao Zhang   /* Rebuild factors */
156686a27549SJunchao Zhang   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
156786a27549SJunchao Zhang   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
156886a27549SJunchao Zhang 
156986a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
157086a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
157186a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
157286a27549SJunchao Zhang 
157386a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
157486a27549SJunchao Zhang 
157586a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
157686a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
157786a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d,n+1);
157886a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
157986a27549SJunchao Zhang 
158086a27549SJunchao Zhang   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1581076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1582076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1583076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
158486a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
158586a27549SJunchao Zhang 
158686a27549SJunchao Zhang   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
158786a27549SJunchao Zhang   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
158886a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
158986a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
159086a27549SJunchao Zhang 
159186a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
159286a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
159386a27549SJunchao Zhang  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
159486a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
159586a27549SJunchao Zhang  #else
159686a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
159786a27549SJunchao Zhang  #endif
159886a27549SJunchao Zhang 
159986a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
160086a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
160186a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
160286a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
160386a27549SJunchao Zhang 
160486a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
160586a27549SJunchao Zhang   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
160686a27549SJunchao Zhang   b     = (Mat_SeqAIJ*)B->data;
160786a27549SJunchao Zhang   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
160886a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
160986a27549SJunchao Zhang   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
161086a27549SJunchao Zhang 
161186a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
161286a27549SJunchao Zhang   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1613930e68a5SMark Adams   PetscFunctionReturn(0);
1614930e68a5SMark Adams }
1615930e68a5SMark Adams 
16168f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
16178f7e8f9dSMark Adams {
16188f7e8f9dSMark Adams   PetscErrorCode   ierr;
16198f7e8f9dSMark Adams   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
16208f7e8f9dSMark Adams   const PetscInt   nrows   = A->rmap->n;
1621930e68a5SMark Adams 
16228f7e8f9dSMark Adams   PetscFunctionBegin;
16238f7e8f9dSMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
16248f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
16258f7e8f9dSMark Adams   // move B data into Kokkos
16268f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
16278f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
16288f7e8f9dSMark Adams   {
16298f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
16308f7e8f9dSMark Adams     if (!baijkok->diag_d) {
16318f7e8f9dSMark Adams       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1632152b3e56SJunchao Zhang       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
16338f7e8f9dSMark Adams       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
16348f7e8f9dSMark Adams     }
16358f7e8f9dSMark Adams   }
16368f7e8f9dSMark Adams   PetscFunctionReturn(0);
16378f7e8f9dSMark Adams }
16388f7e8f9dSMark Adams 
163986a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1640930e68a5SMark Adams {
1641930e68a5SMark Adams   PetscFunctionBegin;
1642930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
1643930e68a5SMark Adams   PetscFunctionReturn(0);
1644930e68a5SMark Adams }
1645930e68a5SMark Adams 
16468f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
16478f7e8f9dSMark Adams {
16488f7e8f9dSMark Adams   PetscFunctionBegin;
16498f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
16508f7e8f9dSMark Adams   PetscFunctionReturn(0);
16518f7e8f9dSMark Adams }
16528f7e8f9dSMark Adams 
1653930e68a5SMark Adams /*MC
165486a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
165586a27549SJunchao Zhang   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1656930e68a5SMark Adams 
1657930e68a5SMark Adams   Level: beginner
1658930e68a5SMark Adams 
165986a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1660930e68a5SMark Adams M*/
166186a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1662930e68a5SMark Adams {
1663930e68a5SMark Adams   PetscErrorCode ierr;
1664930e68a5SMark Adams   PetscInt       n = A->rmap->n;
1665930e68a5SMark Adams 
1666930e68a5SMark Adams   PetscFunctionBegin;
1667930e68a5SMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1668930e68a5SMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1669930e68a5SMark Adams   (*B)->factortype = ftype;
16704ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1671930e68a5SMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1672930e68a5SMark Adams 
16738f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
1674930e68a5SMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
167586a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_TRUE;
167686a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
167786a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
167886a27549SJunchao Zhang     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
167986a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
168086a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
168186a27549SJunchao Zhang   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1682930e68a5SMark Adams 
1683930e68a5SMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
168486a27549SJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1685930e68a5SMark Adams   PetscFunctionReturn(0);
1686930e68a5SMark Adams }
16878f7e8f9dSMark Adams 
16888f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
16898f7e8f9dSMark Adams {
16908f7e8f9dSMark Adams   PetscErrorCode ierr;
16918f7e8f9dSMark Adams   PetscInt       n = A->rmap->n;
16928f7e8f9dSMark Adams 
16938f7e8f9dSMark Adams   PetscFunctionBegin;
16948f7e8f9dSMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
16958f7e8f9dSMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
16968f7e8f9dSMark Adams   (*B)->factortype = ftype;
1697f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
16984ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
16998f7e8f9dSMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
17008f7e8f9dSMark Adams 
17018f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
17028f7e8f9dSMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
17038f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
17048f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
17058f7e8f9dSMark Adams 
17068f7e8f9dSMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
17078f7e8f9dSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
17088f7e8f9dSMark Adams   PetscFunctionReturn(0);
17098f7e8f9dSMark Adams }
171086a27549SJunchao Zhang 
171186a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
171286a27549SJunchao Zhang {
171386a27549SJunchao Zhang   PetscErrorCode ierr;
171486a27549SJunchao Zhang 
171586a27549SJunchao Zhang   PetscFunctionBegin;
171686a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
171786a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
171886a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
171986a27549SJunchao Zhang   PetscFunctionReturn(0);
172086a27549SJunchao Zhang }
172186a27549SJunchao Zhang 
1722076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
1723076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1724076ba34aSJunchao Zhang {
1725076ba34aSJunchao Zhang   PetscErrorCode    ierr;
1726076ba34aSJunchao Zhang   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1727076ba34aSJunchao Zhang   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1728076ba34aSJunchao Zhang   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1729076ba34aSJunchao Zhang   const PetscInt    *i = iv.data();
1730076ba34aSJunchao Zhang   const PetscInt    *j = jv.data();
1731076ba34aSJunchao Zhang   const PetscScalar *a = av.data();
1732076ba34aSJunchao Zhang   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1733076ba34aSJunchao Zhang 
1734076ba34aSJunchao Zhang   PetscFunctionBegin;
1735076ba34aSJunchao Zhang   ierr = PetscPrintf(PETSC_COMM_SELF,"%D x %D SeqAIJKokkos, with %D nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1736076ba34aSJunchao Zhang   for (PetscInt k=0; k<m; k++) {
1737076ba34aSJunchao Zhang     ierr = PetscPrintf(PETSC_COMM_SELF,"%D: ",k);CHKERRQ(ierr);
1738076ba34aSJunchao Zhang     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1739076ba34aSJunchao Zhang       ierr = PetscPrintf(PETSC_COMM_SELF,"%D(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1740076ba34aSJunchao Zhang     }
1741076ba34aSJunchao Zhang     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1742076ba34aSJunchao Zhang   }
1743076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1744076ba34aSJunchao Zhang }
1745