xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision c17cf69914952e266e86d7403471480e6d7133d3)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2076ba34aSJunchao Zhang #include <petscpkg_version.h>
3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
48c3ff71bSJunchao Zhang #include <petscsystypes.h>
58c3ff71bSJunchao Zhang #include <petscerror.h>
68c3ff71bSJunchao Zhang 
78c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
8f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
9076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp>
108c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
118c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
14076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
15076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
1686a27549SJunchao Zhang 
178c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h>
188c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
198c3ff71bSJunchao Zhang 
208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
218c3ff71bSJunchao Zhang 
22076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
25076ba34aSJunchao Zhang  */
268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
278c3ff71bSJunchao Zhang {
288c3ff71bSJunchao Zhang   PetscErrorCode    ierr;
29076ba34aSJunchao Zhang   Mat_SeqAIJ        *aijseq;
30076ba34aSJunchao Zhang   Mat_SeqAIJKokkos  *aijkok;
318c3ff71bSJunchao Zhang 
328c3ff71bSJunchao Zhang   PetscFunctionBegin;
33076ba34aSJunchao Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
348c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
35076ba34aSJunchao Zhang 
36076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38076ba34aSJunchao Zhang 
39076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
40076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
41076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
42076ba34aSJunchao Zhang   */
43076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
44076ba34aSJunchao Zhang     delete aijkok;
45076ba34aSJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
46076ba34aSJunchao Zhang     A->spptr = aijkok;
47076ba34aSJunchao Zhang   }
48076ba34aSJunchao Zhang 
49a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
50a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
51a587d139SMark   }
528c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
538c3ff71bSJunchao Zhang }
548c3ff71bSJunchao Zhang 
5586a27549SJunchao Zhang /* Sync CSR data to device if not yet */
56076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
578c3ff71bSJunchao Zhang {
588c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
598c3ff71bSJunchao Zhang 
608c3ff71bSJunchao Zhang   PetscFunctionBegin;
6186a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
62076ba34aSJunchao Zhang   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
63076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
64076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
65076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
66076ba34aSJunchao Zhang     aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */
6786a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
688c3ff71bSJunchao Zhang   }
698c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
708c3ff71bSJunchao Zhang }
718c3ff71bSJunchao Zhang 
72076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
73076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
7486a27549SJunchao Zhang {
7586a27549SJunchao Zhang   PetscErrorCode   ierr;
7686a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
7786a27549SJunchao Zhang 
7886a27549SJunchao Zhang   PetscFunctionBegin;
7986a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
8086a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
8186a27549SJunchao Zhang   aijkok->a_dual.modify_device();
8286a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
8386a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
8486a27549SJunchao Zhang   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
8586a27549SJunchao Zhang   PetscFunctionReturn(0);
8686a27549SJunchao Zhang }
8786a27549SJunchao Zhang 
88f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
89f0cf5187SStefano Zampini {
90f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
91f0cf5187SStefano Zampini 
92f0cf5187SStefano Zampini   PetscFunctionBegin;
93f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9486a27549SJunchao Zhang    /* We do not expect one needs factors on host  */
9586a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
96f0cf5187SStefano Zampini   if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
97076ba34aSJunchao Zhang   aijkok->a_dual.sync_host();
98f0cf5187SStefano Zampini   PetscFunctionReturn(0);
99f0cf5187SStefano Zampini }
100f0cf5187SStefano Zampini 
101f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
102f0cf5187SStefano Zampini {
103076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
104f0cf5187SStefano Zampini 
105f0cf5187SStefano Zampini   PetscFunctionBegin;
106076ba34aSJunchao Zhang   if (aijkok) {
107076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
108076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
109076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
110076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
111076ba34aSJunchao Zhang   }
112076ba34aSJunchao Zhang   PetscFunctionReturn(0);
113076ba34aSJunchao Zhang }
114076ba34aSJunchao Zhang 
115076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
116076ba34aSJunchao Zhang {
117076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
118076ba34aSJunchao Zhang 
119076ba34aSJunchao Zhang   PetscFunctionBegin;
120076ba34aSJunchao Zhang   if (aijkok) aijkok->a_dual.modify_host();
121076ba34aSJunchao Zhang   PetscFunctionReturn(0);
122076ba34aSJunchao Zhang }
123076ba34aSJunchao Zhang 
124076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
125076ba34aSJunchao Zhang {
126076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
127076ba34aSJunchao Zhang 
128076ba34aSJunchao Zhang   PetscFunctionBegin;
129076ba34aSJunchao Zhang   aijkok->a_dual.sync_host();
130076ba34aSJunchao Zhang   *array = aijkok->a_dual.view_host().data();
131076ba34aSJunchao Zhang   PetscFunctionReturn(0);
132076ba34aSJunchao Zhang }
133076ba34aSJunchao Zhang 
134076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
135076ba34aSJunchao Zhang {
136076ba34aSJunchao Zhang   PetscFunctionBegin;
137076ba34aSJunchao Zhang   *array = NULL;
138076ba34aSJunchao Zhang   PetscFunctionReturn(0);
139076ba34aSJunchao Zhang }
140076ba34aSJunchao Zhang 
141076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
142076ba34aSJunchao Zhang {
143076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
144076ba34aSJunchao Zhang 
145076ba34aSJunchao Zhang   PetscFunctionBegin;
146076ba34aSJunchao Zhang   *array = aijkok->a_dual.view_host().data();
147076ba34aSJunchao Zhang   PetscFunctionReturn(0);
148076ba34aSJunchao Zhang }
149076ba34aSJunchao Zhang 
150076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
151076ba34aSJunchao Zhang {
152076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
153076ba34aSJunchao Zhang 
154076ba34aSJunchao Zhang   PetscFunctionBegin;
155076ba34aSJunchao Zhang   aijkok->a_dual.clear_sync_state();
156076ba34aSJunchao Zhang   aijkok->a_dual.modify_host();
157f0cf5187SStefano Zampini   PetscFunctionReturn(0);
158f0cf5187SStefano Zampini }
159f0cf5187SStefano Zampini 
160a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
161042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
162a587d139SMark {
163042217e8SBarry Smith   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
164042217e8SBarry Smith   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
165a587d139SMark 
166a587d139SMark   PetscFunctionBegin;
167076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
168152b3e56SJunchao Zhang   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
169a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
170a587d139SMark   PetscFunctionReturn(0);
171a587d139SMark }
172a587d139SMark 
173a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
174042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
175a587d139SMark {
176042217e8SBarry Smith   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
177a587d139SMark 
178a587d139SMark   PetscFunctionBegin;
179a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
180a587d139SMark     *d_mat = aijkok->device_mat_d.data();
181a587d139SMark   } else {
182a587d139SMark     PetscErrorCode   ierr;
183a587d139SMark     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
184a587d139SMark     *d_mat  = NULL;
185a587d139SMark   }
186a587d139SMark   PetscFunctionReturn(0);
187a587d139SMark }
188076ba34aSJunchao Zhang 
189076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */
190076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
191152b3e56SJunchao Zhang {
192152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
193152b3e56SJunchao Zhang 
194152b3e56SJunchao Zhang   PetscFunctionBegin;
195076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
196076ba34aSJunchao Zhang   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
197076ba34aSJunchao Zhang     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
198076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
199076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
200076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
20186a27549SJunchao Zhang     aijkok->transpose_updated = PETSC_TRUE;
202076ba34aSJunchao Zhang   }
203076ba34aSJunchao Zhang   *csrmatT = &aijkok->csrmatT;
204152b3e56SJunchao Zhang   PetscFunctionReturn(0);
205152b3e56SJunchao Zhang }
206152b3e56SJunchao Zhang 
207076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */
208076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
209152b3e56SJunchao Zhang {
210152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
211152b3e56SJunchao Zhang 
212152b3e56SJunchao Zhang   PetscFunctionBegin;
213076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
214076ba34aSJunchao Zhang   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
215076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
216076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
217076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
218076ba34aSJunchao Zhang    #if defined(PETSC_USE_COMPLEX)
219076ba34aSJunchao Zhang     const auto& a = aijkok->csrmatH.values;
220076ba34aSJunchao Zhang     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
221076ba34aSJunchao Zhang    #endif
22286a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_TRUE;
223076ba34aSJunchao Zhang   }
224076ba34aSJunchao Zhang   *csrmatH = &aijkok->csrmatH;
225152b3e56SJunchao Zhang   PetscFunctionReturn(0);
226152b3e56SJunchao Zhang }
227a587d139SMark 
2288c3ff71bSJunchao Zhang /* y = A x */
2298c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2308c3ff71bSJunchao Zhang {
2318c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2328c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
233152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
234152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2358c3ff71bSJunchao Zhang 
2368c3ff71bSJunchao Zhang   PetscFunctionBegin;
2378c3ff71bSJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
238152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
239152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
2408c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
241152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
242152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
243152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
244bb2d6e60SJunchao Zhang   ierr   = WaitForKokkos();CHKERRQ(ierr);
245076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
246152b3e56SJunchao Zhang   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2478c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2488c3ff71bSJunchao Zhang }
2498c3ff71bSJunchao Zhang 
2508c3ff71bSJunchao Zhang /* y = A^T x */
2518c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2528c3ff71bSJunchao Zhang {
2538c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2548c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
255152b3e56SJunchao Zhang   const char                       *mode;
256152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
257152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
258076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
2598c3ff71bSJunchao Zhang 
2608c3ff71bSJunchao Zhang   PetscFunctionBegin;
2618c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
262152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
263152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
264152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
265076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
266152b3e56SJunchao Zhang     mode = "N";
267152b3e56SJunchao Zhang   } else {
268076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
269076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
270152b3e56SJunchao Zhang     mode = "T";
271152b3e56SJunchao Zhang   }
272076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
273152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
274152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
275bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
276076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
2778c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2788c3ff71bSJunchao Zhang }
2798c3ff71bSJunchao Zhang 
2808c3ff71bSJunchao Zhang /* y = A^H x */
2818c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2828c3ff71bSJunchao Zhang {
2838c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2848c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
285152b3e56SJunchao Zhang   const char                       *mode;
286152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
287152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
288076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
2898c3ff71bSJunchao Zhang 
2908c3ff71bSJunchao Zhang   PetscFunctionBegin;
2918c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
292152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
293152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
294152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
295076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
296152b3e56SJunchao Zhang     mode = "N";
297152b3e56SJunchao Zhang   } else {
298076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
299076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
300152b3e56SJunchao Zhang     mode = "C";
301152b3e56SJunchao Zhang   }
302076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
303152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
304152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
305bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
306076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
3078c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3088c3ff71bSJunchao Zhang }
3098c3ff71bSJunchao Zhang 
3108c3ff71bSJunchao Zhang /* z = A x + y */
3118c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
3128c3ff71bSJunchao Zhang {
3138c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3148c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
315152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
316152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
3178c3ff71bSJunchao Zhang 
3188c3ff71bSJunchao Zhang   PetscFunctionBegin;
3198c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
320152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
321152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
322152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3238c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
3248c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
325152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
326152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
327152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
328152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
329bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
330152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
3318c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3328c3ff71bSJunchao Zhang }
3338c3ff71bSJunchao Zhang 
3348c3ff71bSJunchao Zhang /* z = A^T x + y */
3358c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3368c3ff71bSJunchao Zhang {
3378c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3388c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
339152b3e56SJunchao Zhang   const char                       *mode;
340152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
341152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
342076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3438c3ff71bSJunchao Zhang 
3448c3ff71bSJunchao Zhang   PetscFunctionBegin;
3458c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
346152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
347152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
348152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3498c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
350152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
351076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
352152b3e56SJunchao Zhang     mode = "N";
353152b3e56SJunchao Zhang   } else {
354076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
355076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
356152b3e56SJunchao Zhang     mode = "T";
357152b3e56SJunchao Zhang   }
358076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
359152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
360152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
361152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
362bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
363076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
3648c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3658c3ff71bSJunchao Zhang }
3668c3ff71bSJunchao Zhang 
3678c3ff71bSJunchao Zhang /* z = A^H x + y */
3688c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3698c3ff71bSJunchao Zhang {
3708c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3718c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
372152b3e56SJunchao Zhang   const char                       *mode;
373152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
374152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
375076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3768c3ff71bSJunchao Zhang 
3778c3ff71bSJunchao Zhang   PetscFunctionBegin;
3788c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
379152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
380152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
381152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3828c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
383152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
384076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
385152b3e56SJunchao Zhang     mode = "N";
386152b3e56SJunchao Zhang   } else {
387076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
388076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
389152b3e56SJunchao Zhang     mode = "C";
390152b3e56SJunchao Zhang   }
391076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
392152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
393152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
394152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
395bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
396076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
397152b3e56SJunchao Zhang   PetscFunctionReturn(0);
398152b3e56SJunchao Zhang }
399152b3e56SJunchao Zhang 
400152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
401152b3e56SJunchao Zhang {
402152b3e56SJunchao Zhang   PetscErrorCode            ierr;
403152b3e56SJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
404152b3e56SJunchao Zhang 
405152b3e56SJunchao Zhang   PetscFunctionBegin;
406152b3e56SJunchao Zhang   switch (op) {
407152b3e56SJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
408152b3e56SJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
409152b3e56SJunchao Zhang       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
410152b3e56SJunchao Zhang       A->form_explicit_transpose = flg;
411152b3e56SJunchao Zhang       break;
412152b3e56SJunchao Zhang     default:
413152b3e56SJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
414152b3e56SJunchao Zhang       break;
415152b3e56SJunchao Zhang   }
4168c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4178c3ff71bSJunchao Zhang }
4188c3ff71bSJunchao Zhang 
419076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
4203d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
4218c3ff71bSJunchao Zhang {
4228c3ff71bSJunchao Zhang   PetscErrorCode   ierr;
423076ba34aSJunchao Zhang   Mat_SeqAIJ       *aseq;
4248c3ff71bSJunchao Zhang 
4258c3ff71bSJunchao Zhang   PetscFunctionBegin;
426a3f881fbSStefano Zampini   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
427076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
428076ba34aSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
4298c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
430076ba34aSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
431076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
432076ba34aSJunchao Zhang     if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
433076ba34aSJunchao Zhang     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
434076ba34aSJunchao Zhang     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
435076ba34aSJunchao Zhang     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
436076ba34aSJunchao Zhang     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
437076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ*>(A->data);
438076ba34aSJunchao Zhang     if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */
439076ba34aSJunchao Zhang       if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
440076ba34aSJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
4418c3ff71bSJunchao Zhang     }
442076ba34aSJunchao Zhang   }
4438c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4448c3ff71bSJunchao Zhang }
4458c3ff71bSJunchao Zhang 
446076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
447076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
448076ba34aSJunchao Zhang  */
449076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
4508c3ff71bSJunchao Zhang {
4518c3ff71bSJunchao Zhang   PetscErrorCode        ierr;
452076ba34aSJunchao Zhang   Mat_SeqAIJ            *bseq;
453076ba34aSJunchao Zhang   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
454076ba34aSJunchao Zhang   Mat                   mat;
4558c3ff71bSJunchao Zhang 
4568c3ff71bSJunchao Zhang   PetscFunctionBegin;
457076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
458076ba34aSJunchao Zhang   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
459076ba34aSJunchao Zhang   mat  = *B;
460076ba34aSJunchao Zhang   if (A->assembled) {
461076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
462076ba34aSJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
463076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
464076ba34aSJunchao Zhang     /* Now copy values to B if needed */
465076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
466076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
467076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
468076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
469076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
470076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
471076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
472076ba34aSJunchao Zhang       }
473076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
474076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
475076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
476076ba34aSJunchao Zhang     }
477076ba34aSJunchao Zhang     mat->spptr = bkok;
478076ba34aSJunchao Zhang   }
479076ba34aSJunchao Zhang 
480076ba34aSJunchao Zhang   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
481076ba34aSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
482076ba34aSJunchao Zhang   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
483076ba34aSJunchao Zhang   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
4848c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4858c3ff71bSJunchao Zhang }
4868c3ff71bSJunchao Zhang 
4878c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
4888c3ff71bSJunchao Zhang {
4898c3ff71bSJunchao Zhang   PetscErrorCode             ierr;
49086a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
4918c3ff71bSJunchao Zhang 
4928c3ff71bSJunchao Zhang   PetscFunctionBegin;
49386a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
49486a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
4958f7e8f9dSMark Adams     if (aijkok) {
4968f7e8f9dSMark Adams       if (aijkok->device_mat_d.data()) {
497a587d139SMark         delete aijkok->colmap_d;
498a587d139SMark         delete aijkok->i_uncompressed_d;
499a587d139SMark       }
5008f7e8f9dSMark Adams       if (aijkok->diag_d) delete aijkok->diag_d;
5018f7e8f9dSMark Adams     }
5028c3ff71bSJunchao Zhang     delete aijkok;
50386a27549SJunchao Zhang   } else {
50486a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
50586a27549SJunchao Zhang   }
506152b3e56SJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
507152b3e56SJunchao Zhang   A->spptr = NULL;
5088c3ff71bSJunchao Zhang   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
5098c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5108c3ff71bSJunchao Zhang }
5118c3ff71bSJunchao Zhang 
51286a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
51386a27549SJunchao Zhang {
51486a27549SJunchao Zhang   PetscErrorCode ierr;
51586a27549SJunchao Zhang 
51686a27549SJunchao Zhang   PetscFunctionBegin;
51786a27549SJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
51886a27549SJunchao Zhang   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
51986a27549SJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
52086a27549SJunchao Zhang   PetscFunctionReturn(0);
52186a27549SJunchao Zhang }
52286a27549SJunchao Zhang 
523076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
524076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
525a3f881fbSStefano Zampini {
526076ba34aSJunchao Zhang   PetscErrorCode               ierr;
527076ba34aSJunchao Zhang   Mat_SeqAIJ                   *a,*b;
528076ba34aSJunchao Zhang   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
529076ba34aSJunchao Zhang   MatScalarKokkosView          aa,ba,ca;
530076ba34aSJunchao Zhang   MatRowMapKokkosView          ai,bi,ci;
531076ba34aSJunchao Zhang   MatColIdxKokkosView          aj,bj,cj;
532076ba34aSJunchao Zhang   PetscInt                     m,n,nnz,aN;
533a3f881fbSStefano Zampini 
534a3f881fbSStefano Zampini   PetscFunctionBegin;
535076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
536076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
537076ba34aSJunchao Zhang   PetscValidPointer(C,4);
538076ba34aSJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
539076ba34aSJunchao Zhang   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
540076ba34aSJunchao Zhang   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
541076ba34aSJunchao Zhang   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
542076ba34aSJunchao Zhang 
543076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
544076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
545076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ*>(A->data);
546076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ*>(B->data);
547076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
548076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
549076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
550076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
551076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
552076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
553076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
554076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
555076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
556076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
557076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
558076ba34aSJunchao Zhang     aj = akok->j_dual.view_device();
559076ba34aSJunchao Zhang     bj = bkok->j_dual.view_device();
560076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
561076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
562076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
563076ba34aSJunchao Zhang     ca = ca_dual.view_device();
564076ba34aSJunchao Zhang     ci = ci_dual.view_device();
565076ba34aSJunchao Zhang     cj = cj_dual.view_device();
566076ba34aSJunchao Zhang 
567076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
568076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
569076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
570076ba34aSJunchao Zhang       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
571076ba34aSJunchao Zhang 
572076ba34aSJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
573076ba34aSJunchao Zhang         ci(i) = coffset;
574076ba34aSJunchao Zhang         if (i == m-1) ci(m) = ai(m) + bi(m);
575076ba34aSJunchao Zhang       });
576076ba34aSJunchao Zhang 
577076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
578076ba34aSJunchao Zhang         if (k < alen) {
579076ba34aSJunchao Zhang           ca(coffset+k) = aa(ai(i)+k);
580076ba34aSJunchao Zhang           cj(coffset+k) = aj(ai(i)+k);
581076ba34aSJunchao Zhang         } else {
582076ba34aSJunchao Zhang           ca(coffset+k) = ba(bi(i)+k-alen);
583076ba34aSJunchao Zhang           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
584076ba34aSJunchao Zhang         }
585076ba34aSJunchao Zhang       });
586076ba34aSJunchao Zhang     });
587076ba34aSJunchao Zhang     ca_dual.modify_device();
588076ba34aSJunchao Zhang     ci_dual.modify_device();
589076ba34aSJunchao Zhang     cj_dual.modify_device();
590076ba34aSJunchao Zhang     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
591076ba34aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
592076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
593076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
594076ba34aSJunchao Zhang     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
595076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
596076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
597076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
598076ba34aSJunchao Zhang 
599076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
600076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
601076ba34aSJunchao Zhang       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
602076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
603076ba34aSJunchao Zhang         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
604076ba34aSJunchao Zhang         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
605076ba34aSJunchao Zhang       });
606076ba34aSJunchao Zhang     });
607076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
608076ba34aSJunchao Zhang   }
609076ba34aSJunchao Zhang   PetscFunctionReturn(0);
610076ba34aSJunchao Zhang }
611076ba34aSJunchao Zhang 
612076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
613076ba34aSJunchao Zhang {
614076ba34aSJunchao Zhang   PetscFunctionBegin;
615076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
616a3f881fbSStefano Zampini   PetscFunctionReturn(0);
617a3f881fbSStefano Zampini }
618a3f881fbSStefano Zampini 
619a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
620a3f881fbSStefano Zampini {
621076ba34aSJunchao Zhang   PetscErrorCode                 ierr;
622a3f881fbSStefano Zampini   Mat_Product                    *product = C->product;
623a3f881fbSStefano Zampini   Mat                            A,B;
624076ba34aSJunchao Zhang   bool                           transA,transB; /* use bool, since KK needs this type */
625a3f881fbSStefano Zampini   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
626a3f881fbSStefano Zampini   Mat_SeqAIJ                     *c;
627076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
628076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB;
629a3f881fbSStefano Zampini 
630a3f881fbSStefano Zampini   PetscFunctionBegin;
631a3f881fbSStefano Zampini   MatCheckProduct(C,1);
632a3f881fbSStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
633076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
634076ba34aSJunchao Zhang 
635076ba34aSJunchao Zhang   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
636076ba34aSJunchao Zhang     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
637076ba34aSJunchao Zhang     PetscFunctionReturn(0);
638076ba34aSJunchao Zhang   }
639076ba34aSJunchao Zhang 
640076ba34aSJunchao Zhang   switch (product->type) {
641076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
642076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
643076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
644076ba34aSJunchao Zhang     default:
645076ba34aSJunchao Zhang       SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
646076ba34aSJunchao Zhang   }
647076ba34aSJunchao Zhang 
648a3f881fbSStefano Zampini   A     = product->A;
649a3f881fbSStefano Zampini   B     = product->B;
650a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
651a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
652a3f881fbSStefano Zampini   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
653a3f881fbSStefano Zampini   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
654a3f881fbSStefano Zampini   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
655076ba34aSJunchao Zhang 
656076ba34aSJunchao Zhang   if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
657076ba34aSJunchao Zhang 
658076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
659076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
660076ba34aSJunchao Zhang 
661076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
662076ba34aSJunchao Zhang   if (transA) {
663076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
664076ba34aSJunchao Zhang     transA = false;
665a3f881fbSStefano Zampini   }
666a3f881fbSStefano Zampini 
667076ba34aSJunchao Zhang   if (transB) {
668076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
669076ba34aSJunchao Zhang     transB = false;
670076ba34aSJunchao Zhang   }
671076ba34aSJunchao Zhang 
672076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
673076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
674076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
675a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
676a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
677a3f881fbSStefano Zampini   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
678a3f881fbSStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
679a3f881fbSStefano Zampini   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
680a3f881fbSStefano Zampini   c->reallocs         = 0;
681076ba34aSJunchao Zhang   C->info.mallocs     = 0;
682a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
683a3f881fbSStefano Zampini   C->assembled        = C->was_assembled = PETSC_TRUE;
684a3f881fbSStefano Zampini   C->num_ass++;
685a3f881fbSStefano Zampini   PetscFunctionReturn(0);
686a3f881fbSStefano Zampini }
687a3f881fbSStefano Zampini 
688a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
689a3f881fbSStefano Zampini {
690a3f881fbSStefano Zampini   PetscErrorCode                 ierr;
691076ba34aSJunchao Zhang   Mat_Product                    *product = C->product;
692076ba34aSJunchao Zhang   MatProductType                 ptype;
693076ba34aSJunchao Zhang   Mat                            A,B;
694076ba34aSJunchao Zhang   bool                           transA,transB;
695076ba34aSJunchao Zhang   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
696076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
697076ba34aSJunchao Zhang   MPI_Comm                       comm;
698076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
699a3f881fbSStefano Zampini 
700a3f881fbSStefano Zampini   PetscFunctionBegin;
701a3f881fbSStefano Zampini   MatCheckProduct(C,1);
702076ba34aSJunchao Zhang   ierr = PetscObjectGetComm((PetscObject)C,&comm);
703076ba34aSJunchao Zhang   if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty");
704a3f881fbSStefano Zampini   A       = product->A;
705a3f881fbSStefano Zampini   B       = product->B;
706a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
707a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
708a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
709a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
710076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
711076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
712076ba34aSJunchao Zhang 
713a3f881fbSStefano Zampini   ptype   = product->type;
714a3f881fbSStefano Zampini   switch (ptype) {
715076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
716076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
717076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
718a3f881fbSStefano Zampini     default:
719076ba34aSJunchao Zhang       SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
720a3f881fbSStefano Zampini   }
721a3f881fbSStefano Zampini 
722076ba34aSJunchao Zhang   product->data = pdata = new MatProductData_SeqAIJKokkos();
723076ba34aSJunchao Zhang   pdata->kh.set_team_work_size(16);
724076ba34aSJunchao Zhang   pdata->kh.set_dynamic_scheduling(true);
725076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
726a3f881fbSStefano Zampini 
727076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
728076ba34aSJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
729076ba34aSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
730076ba34aSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
731076ba34aSJunchao Zhang     /* This algorithm + cuda-10.2 sometimes gave wrong results (invalid device pointers in csrmatC) and failed snes/tutorials/ex56.c */
732076ba34aSJunchao Zhang     spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_CUSPARSE;
733076ba34aSJunchao Zhang   #endif
734076ba34aSJunchao Zhang #endif
735076ba34aSJunchao Zhang   pdata->kh.create_spgemm_handle(spgemm_alg);
736076ba34aSJunchao Zhang 
737076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
738076ba34aSJunchao Zhang   if (transA) {
739076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
740076ba34aSJunchao Zhang     transA = false;
741076ba34aSJunchao Zhang   }
742076ba34aSJunchao Zhang 
743076ba34aSJunchao Zhang   if (transB) {
744076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
745076ba34aSJunchao Zhang     transB = false;
746076ba34aSJunchao Zhang   }
747076ba34aSJunchao Zhang 
748076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
749076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
750076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
751076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
752076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
753076ba34aSJunchao Zhang   */
754076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
755076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
756076ba34aSJunchao Zhang 
757076ba34aSJunchao Zhang   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
758076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
759076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
760a3f881fbSStefano Zampini   PetscFunctionReturn(0);
761a3f881fbSStefano Zampini }
762a3f881fbSStefano Zampini 
763a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
764076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
765a3f881fbSStefano Zampini {
766a3f881fbSStefano Zampini   PetscErrorCode ierr;
767076ba34aSJunchao Zhang   Mat_Product    *product = mat->product;
768a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
769a3f881fbSStefano Zampini 
770a3f881fbSStefano Zampini   PetscFunctionBegin;
771a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
772a3f881fbSStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
773a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
774a3f881fbSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
775a3f881fbSStefano Zampini   }
776a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
777a3f881fbSStefano Zampini     switch (product->type) {
778a3f881fbSStefano Zampini     case MATPRODUCT_AB:
779a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
780a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
781a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
782a3f881fbSStefano Zampini       break;
783a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
784a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
785a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
786a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
787a3f881fbSStefano Zampini       break;
788a3f881fbSStefano Zampini     default:
789a3f881fbSStefano Zampini       break;
790a3f881fbSStefano Zampini     }
791a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
792a3f881fbSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
793a3f881fbSStefano Zampini   }
794a3f881fbSStefano Zampini   PetscFunctionReturn(0);
795a3f881fbSStefano Zampini }
796a587d139SMark 
797f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
798f0cf5187SStefano Zampini {
799f0cf5187SStefano Zampini   PetscErrorCode   ierr;
800f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
801f0cf5187SStefano Zampini 
802f0cf5187SStefano Zampini   PetscFunctionBegin;
803f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
804f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
805076ba34aSJunchao Zhang   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
806076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
807f0cf5187SStefano Zampini   ierr = WaitForKokkos();CHKERRQ(ierr);
808076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
809f0cf5187SStefano Zampini   PetscFunctionReturn(0);
810f0cf5187SStefano Zampini }
811f0cf5187SStefano Zampini 
812a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
813a587d139SMark {
814a587d139SMark   PetscErrorCode   ierr;
815076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
816a587d139SMark 
817a587d139SMark   PetscFunctionBegin;
818076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
819076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
820076ba34aSJunchao Zhang   KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
821076ba34aSJunchao Zhang   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); /* Cached diagonal values are invalided */
822076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
823a587d139SMark   PetscFunctionReturn(0);
824a587d139SMark }
825a587d139SMark 
826db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
827db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
828db78de30SJunchao Zhang {
829db78de30SJunchao Zhang   PetscErrorCode     ierr;
830db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
831db78de30SJunchao Zhang 
832db78de30SJunchao Zhang   PetscFunctionBegin;
833db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
834db78de30SJunchao Zhang   PetscValidPointer(kv,2);
835db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
836db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
837db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
838076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
839db78de30SJunchao Zhang   PetscFunctionReturn(0);
840db78de30SJunchao Zhang }
841db78de30SJunchao Zhang 
842db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
843db78de30SJunchao Zhang {
844db78de30SJunchao Zhang   PetscFunctionBegin;
845db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
846db78de30SJunchao Zhang   PetscValidPointer(kv,2);
847db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
848db78de30SJunchao Zhang   PetscFunctionReturn(0);
849db78de30SJunchao Zhang }
850db78de30SJunchao Zhang 
851db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv)
852db78de30SJunchao Zhang {
853db78de30SJunchao Zhang   PetscErrorCode     ierr;
854db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
855db78de30SJunchao Zhang 
856db78de30SJunchao Zhang   PetscFunctionBegin;
857db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
858db78de30SJunchao Zhang   PetscValidPointer(kv,2);
859db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
860db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
861db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
862076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
863db78de30SJunchao Zhang   PetscFunctionReturn(0);
864db78de30SJunchao Zhang }
865db78de30SJunchao Zhang 
866db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv)
867db78de30SJunchao Zhang {
868db78de30SJunchao Zhang   PetscErrorCode     ierr;
869db78de30SJunchao Zhang 
870db78de30SJunchao Zhang   PetscFunctionBegin;
871db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
872db78de30SJunchao Zhang   PetscValidPointer(kv,2);
873db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
874076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
875db78de30SJunchao Zhang   PetscFunctionReturn(0);
876db78de30SJunchao Zhang }
877db78de30SJunchao Zhang 
878db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
879db78de30SJunchao Zhang {
880db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
881db78de30SJunchao Zhang 
882db78de30SJunchao Zhang   PetscFunctionBegin;
883db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
884db78de30SJunchao Zhang   PetscValidPointer(kv,2);
885db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
886db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
887076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
888db78de30SJunchao Zhang   PetscFunctionReturn(0);
889db78de30SJunchao Zhang }
890db78de30SJunchao Zhang 
891db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
892db78de30SJunchao Zhang {
893db78de30SJunchao Zhang   PetscErrorCode     ierr;
894db78de30SJunchao Zhang 
895db78de30SJunchao Zhang   PetscFunctionBegin;
896db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
897db78de30SJunchao Zhang   PetscValidPointer(kv,2);
898db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
899076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
900db78de30SJunchao Zhang   PetscFunctionReturn(0);
901db78de30SJunchao Zhang }
902db78de30SJunchao Zhang 
903*c17cf699SJunchao Zhang /* Computes Y += alpha X */
904*c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
905a587d139SMark {
906a587d139SMark   PetscErrorCode             ierr;
907a587d139SMark   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
908*c17cf699SJunchao Zhang   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
909*c17cf699SJunchao Zhang   ConstMatScalarKokkosView   Xa;
910*c17cf699SJunchao Zhang   MatScalarKokkosView        Ya;
911a587d139SMark 
912a587d139SMark   PetscFunctionBegin;
913*c17cf699SJunchao Zhang   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
914*c17cf699SJunchao Zhang   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
915*c17cf699SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
916*c17cf699SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
917db78de30SJunchao Zhang 
918*c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
919*c17cf699SJunchao Zhang     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
920a587d139SMark     PetscBool e;
921a587d139SMark     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
922a587d139SMark     if (e) {
923a587d139SMark       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
924*c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
925a587d139SMark     }
926a587d139SMark   }
927db78de30SJunchao Zhang 
928*c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
929*c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
930*c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
931*c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
932*c17cf699SJunchao Zhang   */
933*c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
934*c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
935*c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
936*c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
937*c17cf699SJunchao Zhang 
938*c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
939*c17cf699SJunchao Zhang     KokkosBlas::axpy(alpha,Xa,Ya);
940*c17cf699SJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(Y);
941*c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
942*c17cf699SJunchao Zhang     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
943*c17cf699SJunchao Zhang     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
944*c17cf699SJunchao Zhang 
945*c17cf699SJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
946*c17cf699SJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
947*c17cf699SJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
948*c17cf699SJunchao Zhang         PetscInt p,q = Yi(i);
949*c17cf699SJunchao Zhang         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
950*c17cf699SJunchao Zhang           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
951*c17cf699SJunchao Zhang           if (Xj(p) == Yj(q)) { /* Found it */
952*c17cf699SJunchao Zhang             Ya(q) += alpha * Xa(p);
953*c17cf699SJunchao Zhang             q++;
954a587d139SMark           } else {
955*c17cf699SJunchao Zhang             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
956*c17cf699SJunchao Zhang                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
957*c17cf699SJunchao Zhang             */
958*c17cf699SJunchao Zhang             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
959a587d139SMark           }
960*c17cf699SJunchao Zhang         }
961*c17cf699SJunchao Zhang       });
962*c17cf699SJunchao Zhang     });
963*c17cf699SJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(Y);
964*c17cf699SJunchao Zhang   } else { /* different nonzero patterns */
965*c17cf699SJunchao Zhang     Mat             Z;
966*c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
967*c17cf699SJunchao Zhang     KernelHandle    kh;
968*c17cf699SJunchao Zhang     kh.create_spadd_handle(false);
969*c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
970*c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
971*c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
972*c17cf699SJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr);
973*c17cf699SJunchao Zhang     ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr);
974*c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
975*c17cf699SJunchao Zhang   }
976*c17cf699SJunchao Zhang 
977a587d139SMark   PetscFunctionReturn(0);
978a587d139SMark }
979a587d139SMark 
98086a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
9818f7e8f9dSMark Adams {
9828f7e8f9dSMark Adams   PetscErrorCode ierr;
9838f7e8f9dSMark Adams 
9848f7e8f9dSMark Adams   PetscFunctionBegin;
9858f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
9868f7e8f9dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
9878f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
9888f7e8f9dSMark Adams   PetscFunctionReturn(0);
9898f7e8f9dSMark Adams }
9908f7e8f9dSMark Adams 
9918c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
9928c3ff71bSJunchao Zhang {
993076ba34aSJunchao Zhang   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
994076ba34aSJunchao Zhang 
9958c3ff71bSJunchao Zhang   PetscFunctionBegin;
996076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
9976f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
9986f3d89d0SStefano Zampini 
9998c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
10008c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
10018c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1002a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1003f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1004a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1005076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
10068c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
10078c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
10088c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
10098c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
10108c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
10118c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1012076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1013152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1014076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1015076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1016076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1017076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1018076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1019076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1020076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1021076ba34aSJunchao Zhang }
1022076ba34aSJunchao Zhang 
1023076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1024076ba34aSJunchao Zhang {
1025076ba34aSJunchao Zhang   PetscErrorCode     ierr;
1026076ba34aSJunchao Zhang   Mat_SeqAIJ         *aseq;
1027076ba34aSJunchao Zhang   PetscInt           i,m,n;
1028076ba34aSJunchao Zhang 
1029076ba34aSJunchao Zhang   PetscFunctionBegin;
1030076ba34aSJunchao Zhang   if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1031076ba34aSJunchao Zhang 
1032076ba34aSJunchao Zhang   m    = akok->nrows();
1033076ba34aSJunchao Zhang   n    = akok->ncols();
1034076ba34aSJunchao Zhang   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
1035076ba34aSJunchao Zhang   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1036076ba34aSJunchao Zhang 
1037076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
1038076ba34aSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1039076ba34aSJunchao Zhang   aseq = (Mat_SeqAIJ*)(A)->data;
1040076ba34aSJunchao Zhang 
1041076ba34aSJunchao Zhang   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1042076ba34aSJunchao Zhang   akok->j_dual.sync_host();
1043076ba34aSJunchao Zhang 
1044076ba34aSJunchao Zhang   aseq->i            = akok->i_host_data();
1045076ba34aSJunchao Zhang   aseq->j            = akok->j_host_data();
1046076ba34aSJunchao Zhang   aseq->a            = akok->a_host_data();
1047076ba34aSJunchao Zhang   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1048076ba34aSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE;
1049076ba34aSJunchao Zhang   aseq->free_a       = PETSC_FALSE;
1050076ba34aSJunchao Zhang   aseq->free_ij      = PETSC_FALSE;
1051076ba34aSJunchao Zhang   aseq->nz           = akok->nnz();
1052076ba34aSJunchao Zhang   aseq->maxnz        = aseq->nz;
1053076ba34aSJunchao Zhang 
1054076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1055076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1056076ba34aSJunchao Zhang   for (i=0; i<m; i++) {
1057076ba34aSJunchao Zhang     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1058076ba34aSJunchao Zhang   }
1059076ba34aSJunchao Zhang 
1060076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1061076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1062076ba34aSJunchao Zhang   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1063076ba34aSJunchao Zhang   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1064076ba34aSJunchao Zhang   A->spptr = akok;
1065076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1066076ba34aSJunchao Zhang }
1067076ba34aSJunchao Zhang 
1068076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1069076ba34aSJunchao Zhang 
1070076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1071076ba34aSJunchao Zhang  */
1072076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1073076ba34aSJunchao Zhang {
1074076ba34aSJunchao Zhang   PetscErrorCode     ierr;
1075076ba34aSJunchao Zhang 
1076076ba34aSJunchao Zhang   PetscFunctionBegin;
1077076ba34aSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1078076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
10798c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
10808c3ff71bSJunchao Zhang }
10818c3ff71bSJunchao Zhang 
10828c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
1083152b3e56SJunchao Zhang /*@C
10848c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
10858c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
10868c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
10878c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
10888c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
10898c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
10908c3ff71bSJunchao Zhang 
10918c3ff71bSJunchao Zhang    Collective
10928c3ff71bSJunchao Zhang 
10938c3ff71bSJunchao Zhang    Input Parameters:
10948c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
10958c3ff71bSJunchao Zhang .  m - number of rows
10968c3ff71bSJunchao Zhang .  n - number of columns
10978c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
10988c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
10998c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
11008c3ff71bSJunchao Zhang 
11018c3ff71bSJunchao Zhang    Output Parameter:
11028c3ff71bSJunchao Zhang .  A - the matrix
11038c3ff71bSJunchao Zhang 
11048c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
11058c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
11068c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
11078c3ff71bSJunchao Zhang 
11088c3ff71bSJunchao Zhang    Notes:
11098c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
11108c3ff71bSJunchao Zhang 
11118c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
11128c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
11138c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
11148c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
11158c3ff71bSJunchao Zhang 
11168c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
11178c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
11188c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
11198c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
11208c3ff71bSJunchao Zhang 
11218c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
11228c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
11238c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
11248c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
11258c3ff71bSJunchao Zhang 
11268c3ff71bSJunchao Zhang    Level: intermediate
11278c3ff71bSJunchao Zhang 
1128076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
11298c3ff71bSJunchao Zhang @*/
11308c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
11318c3ff71bSJunchao Zhang {
11328c3ff71bSJunchao Zhang   PetscErrorCode ierr;
11338c3ff71bSJunchao Zhang 
11348c3ff71bSJunchao Zhang   PetscFunctionBegin;
11358c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
11368c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
11378c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
11388c3ff71bSJunchao Zhang   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
11398c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
11408c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
11418c3ff71bSJunchao Zhang }
1142930e68a5SMark Adams 
11438f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
11448f7e8f9dSMark Adams //
11458f7e8f9dSMark Adams // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
11468f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
11478f7e8f9dSMark Adams //
11488f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1149930e68a5SMark Adams {
11508f7e8f9dSMark Adams   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
11518f7e8f9dSMark Adams   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
11528f7e8f9dSMark Adams   IS                 isrow = b->row,isicol = b->icol;
1153930e68a5SMark Adams   PetscErrorCode     ierr;
11548f7e8f9dSMark Adams   const PetscInt     *r_h,*ic_h;
1155076ba34aSJunchao Zhang   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data();
1156076ba34aSJunchao Zhang   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1157076ba34aSJunchao Zhang   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
11588f7e8f9dSMark Adams   PetscBool          row_identity,col_identity;
11598f7e8f9dSMark Adams   PetscInt           nc, Nf, nVec=32; // should be a parameter
11608f7e8f9dSMark Adams   PetscContainer     container;
1161930e68a5SMark Adams 
1162930e68a5SMark Adams   PetscFunctionBegin;
11638f7e8f9dSMark Adams   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n);
11648f7e8f9dSMark Adams   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
11658f7e8f9dSMark Adams   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
11668f7e8f9dSMark Adams   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
11678f7e8f9dSMark Adams   if (container) {
11688f7e8f9dSMark Adams     PetscInt *pNf=NULL, nv;
11698f7e8f9dSMark Adams     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
11708f7e8f9dSMark Adams     Nf = (*pNf)%1000;
11718f7e8f9dSMark Adams     nv = (*pNf)/1000;
11728f7e8f9dSMark Adams     if (nv>0) nVec = nv;
11738f7e8f9dSMark Adams   } else Nf = 1;
11748f7e8f9dSMark Adams   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
11758f7e8f9dSMark Adams   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
11768f7e8f9dSMark Adams   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
11778f7e8f9dSMark Adams   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
11788f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
11798f7e8f9dSMark Adams   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
11808f7e8f9dSMark Adams #endif
11818f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
11828f7e8f9dSMark Adams   {
11838f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
11848f7e8f9dSMark Adams     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
11858f7e8f9dSMark Adams     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
11868f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
11878f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
11888f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
11898f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
11908f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
11918f7e8f9dSMark Adams     size_t flops_h = 0.0;
11928f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
11938f7e8f9dSMark Adams     Kokkos::View<size_t> d_flops_k ("flops");
11948f7e8f9dSMark Adams     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
11958f7e8f9dSMark Adams     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
11968f7e8f9dSMark Adams     Kokkos::deep_copy (d_flops_k, h_flops_k);
11978f7e8f9dSMark Adams     Kokkos::deep_copy (d_r_k, h_r_k);
11988f7e8f9dSMark Adams     Kokkos::deep_copy (d_ic_k, h_ic_k);
11998f7e8f9dSMark Adams     // Fill A --> fact
12008f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1201042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
12028f7e8f9dSMark Adams         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
12038f7e8f9dSMark Adams         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
12048f7e8f9dSMark Adams         // zero rows of B
12058f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
12068f7e8f9dSMark Adams             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
12078f7e8f9dSMark Adams             PetscScalar *baL = ba_d + bi_d[rowb];
12088f7e8f9dSMark Adams             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
12098f7e8f9dSMark Adams             /* zero (unfactored row) */
12108f7e8f9dSMark Adams             for (int j=0;j<nzbL;j++) baL[j] = 0;
12118f7e8f9dSMark Adams             for (int j=0;j<nzbU;j++) baU[j] = 0;
12128f7e8f9dSMark Adams           });
12138f7e8f9dSMark Adams         // copy A into B
12148f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
12158f7e8f9dSMark Adams             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
12168f7e8f9dSMark Adams             const PetscScalar *av    = aa_d + ai_d[rowa];
12178f7e8f9dSMark Adams             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
12188f7e8f9dSMark Adams             /* load in initial (unfactored row) */
12198f7e8f9dSMark Adams             for (int j=0;j<nza;j++) {
12208f7e8f9dSMark Adams               PetscInt    colb = ic[ajtmp[j]];
12218f7e8f9dSMark Adams               PetscScalar vala = av[j];
12228f7e8f9dSMark Adams               if (colb == rowb) {
12238f7e8f9dSMark Adams                 *(ba_d + bdiag_d[rowb]) = vala;
12248f7e8f9dSMark Adams               } else {
12258f7e8f9dSMark Adams                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
12268f7e8f9dSMark Adams                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
12278f7e8f9dSMark Adams                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
12288f7e8f9dSMark Adams                 for (int j=0; j<nz ; j++) {
12298f7e8f9dSMark Adams                   if (pbj[j] == colb) {
12308f7e8f9dSMark Adams                     pba[j] = vala;
12318f7e8f9dSMark Adams                     set++;
12328f7e8f9dSMark Adams                     break;
12338f7e8f9dSMark Adams                   }
12348f7e8f9dSMark Adams                 }
12358f7e8f9dSMark Adams                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
12368f7e8f9dSMark Adams               }
12378f7e8f9dSMark Adams             }
12388f7e8f9dSMark Adams           });
12398f7e8f9dSMark Adams       });
12408f7e8f9dSMark Adams     Kokkos::fence();
1241930e68a5SMark Adams 
12428f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
12438f7e8f9dSMark Adams         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
12448f7e8f9dSMark Adams         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
12458f7e8f9dSMark Adams         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1246042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
12478f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc;
12488f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
12498f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
12508f7e8f9dSMark Adams         for (int ii=start; ii<end-1; ii++) {
12518f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
12528f7e8f9dSMark Adams           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
12538f7e8f9dSMark Adams           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
12548f7e8f9dSMark Adams           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
12558f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
12568f7e8f9dSMark Adams               PetscInt kIdx = j*Ni + field_block_idx;
12578f7e8f9dSMark Adams               if (kIdx >= nzUi) /* void */ ;
12588f7e8f9dSMark Adams               else {
12598f7e8f9dSMark Adams                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
12608f7e8f9dSMark Adams                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
12618f7e8f9dSMark Adams                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
12628f7e8f9dSMark Adams                 size_t         st_idx;
12638f7e8f9dSMark Adams                 // find and do L(k,i) = A(:k,i) / A(i,i)
12648f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
12658f7e8f9dSMark Adams                 // get column, there has got to be a better way
12668f7e8f9dSMark Adams                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
12678f7e8f9dSMark Adams                     //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii);
12688f7e8f9dSMark Adams                     if (pjL[j] == ii) {
12698f7e8f9dSMark Adams                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
12708f7e8f9dSMark Adams                       idx = j; // output
12718f7e8f9dSMark Adams                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
12728f7e8f9dSMark Adams                     }
12738f7e8f9dSMark Adams                   }, st_idx);
12748f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
12758f7e8f9dSMark Adams                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii);
12768f7e8f9dSMark Adams                 else { // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
12778f7e8f9dSMark Adams                   // U(i+1,:end)
12788f7e8f9dSMark Adams                   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
12798f7e8f9dSMark Adams                       PetscScalar Uij = baUi[uiIdx];
12808f7e8f9dSMark Adams                       PetscInt    col = bjUi[uiIdx];
12818f7e8f9dSMark Adams                       if (col==myk) {
12828f7e8f9dSMark Adams                         // A_kk = A_kk - L_ki * U_ij(k)
12838f7e8f9dSMark Adams                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
12848f7e8f9dSMark Adams                         *Akkv = *Akkv - L_ki() * Uij; // UiK
12858f7e8f9dSMark Adams                       } else {
12868f7e8f9dSMark Adams                         PetscScalar    *start, *end, *pAkjv=NULL;
12878f7e8f9dSMark Adams                         PetscInt       high, low;
12888f7e8f9dSMark Adams                         const PetscInt *startj;
12898f7e8f9dSMark Adams                         if (col<myk) { // L
12908f7e8f9dSMark Adams                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
12918f7e8f9dSMark Adams                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
12928f7e8f9dSMark Adams                           start = pLki+1; // start at pLki+1, A22(myk,1)
12938f7e8f9dSMark Adams                           startj= bj_d + bi_d[myk] + idx;
12948f7e8f9dSMark Adams                           end   = ba_d + bi_d[myk+1];
12958f7e8f9dSMark Adams                         } else {
12968f7e8f9dSMark Adams                           PetscInt idx = bdiag_d[myk+1]+1;
12978f7e8f9dSMark Adams                           start = ba_d + idx;
12988f7e8f9dSMark Adams                           startj= bj_d + idx;
12998f7e8f9dSMark Adams                           end   = ba_d + bdiag_d[myk];
13008f7e8f9dSMark Adams                         }
13018f7e8f9dSMark Adams                         // search for 'col', use bisection search - TODO
13028f7e8f9dSMark Adams                         low  = 0;
13038f7e8f9dSMark Adams                         high = (PetscInt)(end-start);
13048f7e8f9dSMark Adams                         while (high-low > 5) {
13058f7e8f9dSMark Adams                           int t = (low+high)/2;
13068f7e8f9dSMark Adams                           if (startj[t] > col) high = t;
13078f7e8f9dSMark Adams                           else                 low  = t;
13088f7e8f9dSMark Adams                         }
13098f7e8f9dSMark Adams                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
13108f7e8f9dSMark Adams                           if (startj[pAkjv-start] == col) break;
13118f7e8f9dSMark Adams                         }
13128f7e8f9dSMark Adams                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col);
13138f7e8f9dSMark Adams                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
13148f7e8f9dSMark Adams                       }
13158f7e8f9dSMark Adams                     });
13168f7e8f9dSMark Adams                 }
13178f7e8f9dSMark Adams               }
13188f7e8f9dSMark Adams             });
13198f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
13208f7e8f9dSMark Adams           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
13218f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
13228f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
13238f7e8f9dSMark Adams       });
13248f7e8f9dSMark Adams     Kokkos::fence();
13258f7e8f9dSMark Adams     Kokkos::deep_copy (h_flops_k, d_flops_k);
13268f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
13278f7e8f9dSMark Adams     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
13288f7e8f9dSMark Adams #elif  defined(PETSC_USE_LOG)
13298f7e8f9dSMark Adams     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
13308f7e8f9dSMark Adams #endif
13318f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
13328f7e8f9dSMark Adams         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
13338f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
13348f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
13358f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
13368f7e8f9dSMark Adams             int i = start + outer_index*Ni + lg_rank%Ni;
13378f7e8f9dSMark Adams             if (i < end) {
13388f7e8f9dSMark Adams               PetscScalar *pv = ba_d + bdiag_d[i];
13398f7e8f9dSMark Adams               *pv = 1.0/(*pv);
13408f7e8f9dSMark Adams             }
13418f7e8f9dSMark Adams           });
13428f7e8f9dSMark Adams       });
13438f7e8f9dSMark Adams   }
13448f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
13458f7e8f9dSMark Adams   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
13468f7e8f9dSMark Adams #endif
13478f7e8f9dSMark Adams   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
13488f7e8f9dSMark Adams   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
13498f7e8f9dSMark Adams 
13508f7e8f9dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
13518f7e8f9dSMark Adams   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
13528f7e8f9dSMark Adams   if (b->inode.size) {
13538f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
13548f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
13558f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
13568f7e8f9dSMark Adams   } else {
13578f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
13588f7e8f9dSMark Adams   }
13598f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
13608f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
13618f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
13628f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
13638f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
13648f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
13658f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
13668f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
13678f7e8f9dSMark Adams 
1368930e68a5SMark Adams   PetscFunctionReturn(0);
1369930e68a5SMark Adams }
1370930e68a5SMark Adams 
137186a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1372930e68a5SMark Adams {
1373930e68a5SMark Adams   PetscErrorCode   ierr;
1374930e68a5SMark Adams 
1375930e68a5SMark Adams   PetscFunctionBegin;
1376930e68a5SMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
137786a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
137886a27549SJunchao Zhang   PetscFunctionReturn(0);
137986a27549SJunchao Zhang }
138086a27549SJunchao Zhang 
138186a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
138286a27549SJunchao Zhang {
138386a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
138486a27549SJunchao Zhang 
138586a27549SJunchao Zhang   PetscFunctionBegin;
138686a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
138786a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
138886a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
138986a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
139086a27549SJunchao Zhang   }
139186a27549SJunchao Zhang   PetscFunctionReturn(0);
139286a27549SJunchao Zhang }
139386a27549SJunchao Zhang 
139486a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
139586a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
139686a27549SJunchao Zhang {
139786a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1398076ba34aSJunchao Zhang   MatColIdxType             n = A->rmap->n;
139986a27549SJunchao Zhang 
140086a27549SJunchao Zhang   PetscFunctionBegin;
140186a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
140286a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
1403076ba34aSJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
140486a27549SJunchao Zhang     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1405076ba34aSJunchao Zhang     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1406076ba34aSJunchao Zhang     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
140786a27549SJunchao Zhang 
140886a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1409076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1410076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1411076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
141286a27549SJunchao Zhang         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
141386a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
141486a27549SJunchao Zhang 
141586a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
141686a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
141786a27549SJunchao Zhang     */
1418076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1419076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
142086a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
142186a27549SJunchao Zhang 
142286a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
142386a27549SJunchao Zhang 
142486a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
1425076ba34aSJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
142686a27549SJunchao Zhang     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1427076ba34aSJunchao Zhang     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1428076ba34aSJunchao Zhang     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
142986a27549SJunchao Zhang 
143086a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1431076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1432076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1433076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
143486a27549SJunchao Zhang         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
143586a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
143686a27549SJunchao Zhang 
143786a27549SJunchao Zhang     /* Sort indices. See comments above */
1438076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1439076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
144086a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
144186a27549SJunchao Zhang 
144286a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
144386a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
144486a27549SJunchao Zhang   }
144586a27549SJunchao Zhang   PetscFunctionReturn(0);
144686a27549SJunchao Zhang }
144786a27549SJunchao Zhang 
144886a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
144986a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
145086a27549SJunchao Zhang {
145186a27549SJunchao Zhang   PetscErrorCode                 ierr;
145286a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
145386a27549SJunchao Zhang   PetscScalarKokkosView          xv;
145486a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
145586a27549SJunchao Zhang 
145686a27549SJunchao Zhang   PetscFunctionBegin;
145786a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
145886a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
145986a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
146086a27549SJunchao Zhang   /* Solve L tmpv = b */
14613f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
146286a27549SJunchao Zhang   /* Solve Ux = tmpv */
14633f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
146486a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
146586a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
146686a27549SJunchao Zhang   PetscFunctionReturn(0);
146786a27549SJunchao Zhang }
146886a27549SJunchao Zhang 
1469076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
147086a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
147186a27549SJunchao Zhang {
147286a27549SJunchao Zhang   PetscErrorCode                 ierr;
147386a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
147486a27549SJunchao Zhang   PetscScalarKokkosView          xv;
147586a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
147686a27549SJunchao Zhang 
147786a27549SJunchao Zhang   PetscFunctionBegin;
147886a27549SJunchao Zhang   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
147986a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
148086a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
148186a27549SJunchao Zhang   /* Solve U^T tmpv = b */
148286a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
148386a27549SJunchao Zhang 
148486a27549SJunchao Zhang   /* Solve L^T x = tmpv */
148586a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
148686a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
148786a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
148886a27549SJunchao Zhang   PetscFunctionReturn(0);
148986a27549SJunchao Zhang }
149086a27549SJunchao Zhang 
149186a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
149286a27549SJunchao Zhang {
149386a27549SJunchao Zhang   PetscErrorCode                 ierr;
149486a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
149586a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
149686a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
149786a27549SJunchao Zhang 
149886a27549SJunchao Zhang   PetscFunctionBegin;
149986a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1500076ba34aSJunchao Zhang 
1501076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1502076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1503076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1504076ba34aSJunchao Zhang 
1505076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
150686a27549SJunchao Zhang 
150786a27549SJunchao Zhang   B->assembled                       = PETSC_TRUE;
150886a27549SJunchao Zhang   B->preallocated                    = PETSC_TRUE;
150986a27549SJunchao Zhang   B->ops->solve                      = MatSolve_SeqAIJKokkos;
151086a27549SJunchao Zhang   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
151186a27549SJunchao Zhang   B->ops->matsolve                   = NULL;
151286a27549SJunchao Zhang   B->ops->matsolvetranspose          = NULL;
151386a27549SJunchao Zhang   B->offloadmask                     = PETSC_OFFLOAD_GPU;
151486a27549SJunchao Zhang 
151586a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
151686a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
151786a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
151886a27549SJunchao Zhang   PetscFunctionReturn(0);
151986a27549SJunchao Zhang }
152086a27549SJunchao Zhang 
152186a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
152286a27549SJunchao Zhang {
152386a27549SJunchao Zhang   PetscErrorCode                 ierr;
152486a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok;
152586a27549SJunchao Zhang   Mat_SeqAIJ                     *b;
152686a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
152786a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
152886a27549SJunchao Zhang   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
152986a27549SJunchao Zhang   PetscInt                       n = A->rmap->n;
153086a27549SJunchao Zhang 
153186a27549SJunchao Zhang   PetscFunctionBegin;
153286a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
153386a27549SJunchao Zhang   /* Rebuild factors */
153486a27549SJunchao Zhang   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
153586a27549SJunchao Zhang   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
153686a27549SJunchao Zhang 
153786a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
153886a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
153986a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
154086a27549SJunchao Zhang 
154186a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
154286a27549SJunchao Zhang 
154386a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
154486a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
154586a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d,n+1);
154686a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
154786a27549SJunchao Zhang 
154886a27549SJunchao Zhang   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1549076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1550076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1551076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
155286a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
155386a27549SJunchao Zhang 
155486a27549SJunchao Zhang   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
155586a27549SJunchao Zhang   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
155686a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
155786a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
155886a27549SJunchao Zhang 
155986a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
156086a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
156186a27549SJunchao Zhang  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
156286a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
156386a27549SJunchao Zhang  #else
156486a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
156586a27549SJunchao Zhang  #endif
156686a27549SJunchao Zhang 
156786a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
156886a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
156986a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
157086a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
157186a27549SJunchao Zhang 
157286a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
157386a27549SJunchao Zhang   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
157486a27549SJunchao Zhang   b     = (Mat_SeqAIJ*)B->data;
157586a27549SJunchao Zhang   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
157686a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
157786a27549SJunchao Zhang   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
157886a27549SJunchao Zhang 
157986a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
158086a27549SJunchao Zhang   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1581930e68a5SMark Adams   PetscFunctionReturn(0);
1582930e68a5SMark Adams }
1583930e68a5SMark Adams 
15848f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
15858f7e8f9dSMark Adams {
15868f7e8f9dSMark Adams   PetscErrorCode   ierr;
15878f7e8f9dSMark Adams   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
15888f7e8f9dSMark Adams   const PetscInt   nrows   = A->rmap->n;
1589930e68a5SMark Adams 
15908f7e8f9dSMark Adams   PetscFunctionBegin;
15918f7e8f9dSMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
15928f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
15938f7e8f9dSMark Adams   // move B data into Kokkos
15948f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
15958f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
15968f7e8f9dSMark Adams   {
15978f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
15988f7e8f9dSMark Adams     if (!baijkok->diag_d) {
15998f7e8f9dSMark Adams       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1600152b3e56SJunchao Zhang       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
16018f7e8f9dSMark Adams       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
16028f7e8f9dSMark Adams     }
16038f7e8f9dSMark Adams   }
16048f7e8f9dSMark Adams   PetscFunctionReturn(0);
16058f7e8f9dSMark Adams }
16068f7e8f9dSMark Adams 
160786a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1608930e68a5SMark Adams {
1609930e68a5SMark Adams   PetscFunctionBegin;
1610930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
1611930e68a5SMark Adams   PetscFunctionReturn(0);
1612930e68a5SMark Adams }
1613930e68a5SMark Adams 
16148f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
16158f7e8f9dSMark Adams {
16168f7e8f9dSMark Adams   PetscFunctionBegin;
16178f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
16188f7e8f9dSMark Adams   PetscFunctionReturn(0);
16198f7e8f9dSMark Adams }
16208f7e8f9dSMark Adams 
1621930e68a5SMark Adams /*MC
162286a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
162386a27549SJunchao Zhang   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1624930e68a5SMark Adams 
1625930e68a5SMark Adams   Level: beginner
1626930e68a5SMark Adams 
162786a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1628930e68a5SMark Adams M*/
162986a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1630930e68a5SMark Adams {
1631930e68a5SMark Adams   PetscErrorCode ierr;
1632930e68a5SMark Adams   PetscInt       n = A->rmap->n;
1633930e68a5SMark Adams 
1634930e68a5SMark Adams   PetscFunctionBegin;
1635930e68a5SMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1636930e68a5SMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1637930e68a5SMark Adams   (*B)->factortype = ftype;
16384ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1639930e68a5SMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1640930e68a5SMark Adams 
16418f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
1642930e68a5SMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
164386a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_TRUE;
164486a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
164586a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
164686a27549SJunchao Zhang     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
164786a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
164886a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
164986a27549SJunchao Zhang   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1650930e68a5SMark Adams 
1651930e68a5SMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
165286a27549SJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1653930e68a5SMark Adams   PetscFunctionReturn(0);
1654930e68a5SMark Adams }
16558f7e8f9dSMark Adams 
16568f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
16578f7e8f9dSMark Adams {
16588f7e8f9dSMark Adams   PetscErrorCode ierr;
16598f7e8f9dSMark Adams   PetscInt       n = A->rmap->n;
16608f7e8f9dSMark Adams 
16618f7e8f9dSMark Adams   PetscFunctionBegin;
16628f7e8f9dSMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
16638f7e8f9dSMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
16648f7e8f9dSMark Adams   (*B)->factortype = ftype;
1665f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
16664ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
16678f7e8f9dSMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
16688f7e8f9dSMark Adams 
16698f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
16708f7e8f9dSMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
16718f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
16728f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
16738f7e8f9dSMark Adams 
16748f7e8f9dSMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
16758f7e8f9dSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
16768f7e8f9dSMark Adams   PetscFunctionReturn(0);
16778f7e8f9dSMark Adams }
167886a27549SJunchao Zhang 
167986a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
168086a27549SJunchao Zhang {
168186a27549SJunchao Zhang   PetscErrorCode ierr;
168286a27549SJunchao Zhang 
168386a27549SJunchao Zhang   PetscFunctionBegin;
168486a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
168586a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
168686a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
168786a27549SJunchao Zhang   PetscFunctionReturn(0);
168886a27549SJunchao Zhang }
168986a27549SJunchao Zhang 
1690076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
1691076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1692076ba34aSJunchao Zhang {
1693076ba34aSJunchao Zhang   PetscErrorCode    ierr;
1694076ba34aSJunchao Zhang   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1695076ba34aSJunchao Zhang   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1696076ba34aSJunchao Zhang   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1697076ba34aSJunchao Zhang   const PetscInt    *i = iv.data();
1698076ba34aSJunchao Zhang   const PetscInt    *j = jv.data();
1699076ba34aSJunchao Zhang   const PetscScalar *a = av.data();
1700076ba34aSJunchao Zhang   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1701076ba34aSJunchao Zhang 
1702076ba34aSJunchao Zhang   PetscFunctionBegin;
1703076ba34aSJunchao Zhang   ierr = PetscPrintf(PETSC_COMM_SELF,"%D x %D SeqAIJKokkos, with %D nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1704076ba34aSJunchao Zhang   for (PetscInt k=0; k<m; k++) {
1705076ba34aSJunchao Zhang     ierr = PetscPrintf(PETSC_COMM_SELF,"%D: ",k);CHKERRQ(ierr);
1706076ba34aSJunchao Zhang     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1707076ba34aSJunchao Zhang       ierr = PetscPrintf(PETSC_COMM_SELF,"%D(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1708076ba34aSJunchao Zhang     }
1709076ba34aSJunchao Zhang     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1710076ba34aSJunchao Zhang   }
1711076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1712076ba34aSJunchao Zhang }
1713