xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 076ba34a670039b3127ea98935d34fd539c66ade)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2*076ba34aSJunchao Zhang #include <petscpkg_version.h>
3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
48c3ff71bSJunchao Zhang #include <petscsystypes.h>
58c3ff71bSJunchao Zhang #include <petscerror.h>
68c3ff71bSJunchao Zhang 
78c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
8f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
9*076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp>
108c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
118c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
14*076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
15*076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
1686a27549SJunchao Zhang 
178c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h>
188c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
198c3ff71bSJunchao Zhang 
208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
218c3ff71bSJunchao Zhang 
22*076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23*076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24*076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
25*076ba34aSJunchao Zhang  */
268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
278c3ff71bSJunchao Zhang {
288c3ff71bSJunchao Zhang   PetscErrorCode    ierr;
29*076ba34aSJunchao Zhang   Mat_SeqAIJ        *aijseq;
30*076ba34aSJunchao Zhang   Mat_SeqAIJKokkos  *aijkok;
318c3ff71bSJunchao Zhang 
328c3ff71bSJunchao Zhang   PetscFunctionBegin;
33*076ba34aSJunchao Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
348c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
35*076ba34aSJunchao Zhang 
36*076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
37*076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
38*076ba34aSJunchao Zhang 
39*076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
40*076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
41*076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
42*076ba34aSJunchao Zhang   */
43*076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
44*076ba34aSJunchao Zhang     delete aijkok;
45*076ba34aSJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
46*076ba34aSJunchao Zhang     A->spptr = aijkok;
47*076ba34aSJunchao Zhang   }
48*076ba34aSJunchao Zhang 
49a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
50a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
51a587d139SMark   }
528c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
538c3ff71bSJunchao Zhang }
548c3ff71bSJunchao Zhang 
5586a27549SJunchao Zhang /* Sync CSR data to device if not yet */
56*076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
578c3ff71bSJunchao Zhang {
588c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
598c3ff71bSJunchao Zhang 
608c3ff71bSJunchao Zhang   PetscFunctionBegin;
6186a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
62*076ba34aSJunchao Zhang   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device");
63*076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
64*076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
65*076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
66*076ba34aSJunchao Zhang     aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */
6786a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
688c3ff71bSJunchao Zhang   }
698c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
708c3ff71bSJunchao Zhang }
718c3ff71bSJunchao Zhang 
72*076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
73*076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
7486a27549SJunchao Zhang {
7586a27549SJunchao Zhang   PetscErrorCode   ierr;
7686a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
7786a27549SJunchao Zhang 
7886a27549SJunchao Zhang   PetscFunctionBegin;
7986a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
8086a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
8186a27549SJunchao Zhang   aijkok->a_dual.modify_device();
8286a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
8386a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
8486a27549SJunchao Zhang   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
8586a27549SJunchao Zhang   PetscFunctionReturn(0);
8686a27549SJunchao Zhang }
8786a27549SJunchao Zhang 
88f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
89f0cf5187SStefano Zampini {
90f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
91f0cf5187SStefano Zampini 
92f0cf5187SStefano Zampini   PetscFunctionBegin;
93f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9486a27549SJunchao Zhang    /* We do not expect one needs factors on host  */
9586a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
96f0cf5187SStefano Zampini   if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
97*076ba34aSJunchao Zhang   aijkok->a_dual.sync_host();
98f0cf5187SStefano Zampini   PetscFunctionReturn(0);
99f0cf5187SStefano Zampini }
100f0cf5187SStefano Zampini 
101f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
102f0cf5187SStefano Zampini {
103*076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
104f0cf5187SStefano Zampini 
105f0cf5187SStefano Zampini   PetscFunctionBegin;
106*076ba34aSJunchao Zhang   if (aijkok) {
107*076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
108*076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
109*076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
110*076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
111*076ba34aSJunchao Zhang   }
112*076ba34aSJunchao Zhang   PetscFunctionReturn(0);
113*076ba34aSJunchao Zhang }
114*076ba34aSJunchao Zhang 
115*076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
116*076ba34aSJunchao Zhang {
117*076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
118*076ba34aSJunchao Zhang 
119*076ba34aSJunchao Zhang   PetscFunctionBegin;
120*076ba34aSJunchao Zhang   if (aijkok) aijkok->a_dual.modify_host();
121*076ba34aSJunchao Zhang   PetscFunctionReturn(0);
122*076ba34aSJunchao Zhang }
123*076ba34aSJunchao Zhang 
124*076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
125*076ba34aSJunchao Zhang {
126*076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
127*076ba34aSJunchao Zhang 
128*076ba34aSJunchao Zhang   PetscFunctionBegin;
129*076ba34aSJunchao Zhang   aijkok->a_dual.sync_host();
130*076ba34aSJunchao Zhang   *array = aijkok->a_dual.view_host().data();
131*076ba34aSJunchao Zhang   PetscFunctionReturn(0);
132*076ba34aSJunchao Zhang }
133*076ba34aSJunchao Zhang 
134*076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
135*076ba34aSJunchao Zhang {
136*076ba34aSJunchao Zhang   PetscFunctionBegin;
137*076ba34aSJunchao Zhang   *array = NULL;
138*076ba34aSJunchao Zhang   PetscFunctionReturn(0);
139*076ba34aSJunchao Zhang }
140*076ba34aSJunchao Zhang 
141*076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
142*076ba34aSJunchao Zhang {
143*076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
144*076ba34aSJunchao Zhang 
145*076ba34aSJunchao Zhang   PetscFunctionBegin;
146*076ba34aSJunchao Zhang   *array = aijkok->a_dual.view_host().data();
147*076ba34aSJunchao Zhang   PetscFunctionReturn(0);
148*076ba34aSJunchao Zhang }
149*076ba34aSJunchao Zhang 
150*076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
151*076ba34aSJunchao Zhang {
152*076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
153*076ba34aSJunchao Zhang 
154*076ba34aSJunchao Zhang   PetscFunctionBegin;
155*076ba34aSJunchao Zhang   aijkok->a_dual.clear_sync_state();
156*076ba34aSJunchao Zhang   aijkok->a_dual.modify_host();
157f0cf5187SStefano Zampini   PetscFunctionReturn(0);
158f0cf5187SStefano Zampini }
159f0cf5187SStefano Zampini 
160a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
161042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
162a587d139SMark {
163042217e8SBarry Smith   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
164042217e8SBarry Smith   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
165a587d139SMark 
166a587d139SMark   PetscFunctionBegin;
167*076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
168152b3e56SJunchao Zhang   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
169a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
170a587d139SMark   PetscFunctionReturn(0);
171a587d139SMark }
172a587d139SMark 
173a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
174042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
175a587d139SMark {
176042217e8SBarry Smith   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
177a587d139SMark 
178a587d139SMark   PetscFunctionBegin;
179a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
180a587d139SMark     *d_mat = aijkok->device_mat_d.data();
181a587d139SMark   } else {
182a587d139SMark     PetscErrorCode   ierr;
183a587d139SMark     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
184a587d139SMark     *d_mat  = NULL;
185a587d139SMark   }
186a587d139SMark   PetscFunctionReturn(0);
187a587d139SMark }
188*076ba34aSJunchao Zhang 
189*076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */
190*076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
191152b3e56SJunchao Zhang {
192152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
193152b3e56SJunchao Zhang 
194152b3e56SJunchao Zhang   PetscFunctionBegin;
195*076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
196*076ba34aSJunchao Zhang   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
197*076ba34aSJunchao Zhang     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
198*076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
199*076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
200*076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
20186a27549SJunchao Zhang     aijkok->transpose_updated = PETSC_TRUE;
202*076ba34aSJunchao Zhang   }
203*076ba34aSJunchao Zhang   *csrmatT = &aijkok->csrmatT;
204152b3e56SJunchao Zhang   PetscFunctionReturn(0);
205152b3e56SJunchao Zhang }
206152b3e56SJunchao Zhang 
207*076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */
208*076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
209152b3e56SJunchao Zhang {
210152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
211152b3e56SJunchao Zhang 
212152b3e56SJunchao Zhang   PetscFunctionBegin;
213*076ba34aSJunchao Zhang   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
214*076ba34aSJunchao Zhang   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
215*076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
216*076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
217*076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
218*076ba34aSJunchao Zhang    #if defined(PETSC_USE_COMPLEX)
219*076ba34aSJunchao Zhang     const auto& a = aijkok->csrmatH.values;
220*076ba34aSJunchao Zhang     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
221*076ba34aSJunchao Zhang    #endif
22286a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_TRUE;
223*076ba34aSJunchao Zhang   }
224*076ba34aSJunchao Zhang   *csrmatH = &aijkok->csrmatH;
225152b3e56SJunchao Zhang   PetscFunctionReturn(0);
226152b3e56SJunchao Zhang }
227a587d139SMark 
2288c3ff71bSJunchao Zhang /* y = A x */
2298c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2308c3ff71bSJunchao Zhang {
2318c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2328c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
233152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
234152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2358c3ff71bSJunchao Zhang 
2368c3ff71bSJunchao Zhang   PetscFunctionBegin;
2378c3ff71bSJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
238152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
239152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
2408c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
241152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
242152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
243152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
244bb2d6e60SJunchao Zhang   ierr   = WaitForKokkos();CHKERRQ(ierr);
245*076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
246152b3e56SJunchao Zhang   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2478c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2488c3ff71bSJunchao Zhang }
2498c3ff71bSJunchao Zhang 
2508c3ff71bSJunchao Zhang /* y = A^T x */
2518c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2528c3ff71bSJunchao Zhang {
2538c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2548c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
255152b3e56SJunchao Zhang   const char                       *mode;
256152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
257152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
258*076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
2598c3ff71bSJunchao Zhang 
2608c3ff71bSJunchao Zhang   PetscFunctionBegin;
2618c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
262152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
263152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
264152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
265*076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
266152b3e56SJunchao Zhang     mode = "N";
267152b3e56SJunchao Zhang   } else {
268*076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
269*076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
270152b3e56SJunchao Zhang     mode = "T";
271152b3e56SJunchao Zhang   }
272*076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
273152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
274152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
275bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
276*076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
2778c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2788c3ff71bSJunchao Zhang }
2798c3ff71bSJunchao Zhang 
2808c3ff71bSJunchao Zhang /* y = A^H x */
2818c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2828c3ff71bSJunchao Zhang {
2838c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2848c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
285152b3e56SJunchao Zhang   const char                       *mode;
286152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
287152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
288*076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
2898c3ff71bSJunchao Zhang 
2908c3ff71bSJunchao Zhang   PetscFunctionBegin;
2918c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
292152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
293152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
294152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
295*076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
296152b3e56SJunchao Zhang     mode = "N";
297152b3e56SJunchao Zhang   } else {
298*076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
299*076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
300152b3e56SJunchao Zhang     mode = "C";
301152b3e56SJunchao Zhang   }
302*076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
303152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
304152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
305bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
306*076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
3078c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3088c3ff71bSJunchao Zhang }
3098c3ff71bSJunchao Zhang 
3108c3ff71bSJunchao Zhang /* z = A x + y */
3118c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
3128c3ff71bSJunchao Zhang {
3138c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3148c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
315152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
316152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
3178c3ff71bSJunchao Zhang 
3188c3ff71bSJunchao Zhang   PetscFunctionBegin;
3198c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
320152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
321152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
322152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3238c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
3248c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
325152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
326152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
327152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
328152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
329bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
330152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
3318c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3328c3ff71bSJunchao Zhang }
3338c3ff71bSJunchao Zhang 
3348c3ff71bSJunchao Zhang /* z = A^T x + y */
3358c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3368c3ff71bSJunchao Zhang {
3378c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3388c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
339152b3e56SJunchao Zhang   const char                       *mode;
340152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
341152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
342*076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3438c3ff71bSJunchao Zhang 
3448c3ff71bSJunchao Zhang   PetscFunctionBegin;
3458c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
346152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
347152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
348152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3498c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
350152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
351*076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr);
352152b3e56SJunchao Zhang     mode = "N";
353152b3e56SJunchao Zhang   } else {
354*076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
355*076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
356152b3e56SJunchao Zhang     mode = "T";
357152b3e56SJunchao Zhang   }
358*076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
359152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
360152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
361152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
362bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
363*076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
3648c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3658c3ff71bSJunchao Zhang }
3668c3ff71bSJunchao Zhang 
3678c3ff71bSJunchao Zhang /* z = A^H x + y */
3688c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3698c3ff71bSJunchao Zhang {
3708c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3718c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
372152b3e56SJunchao Zhang   const char                       *mode;
373152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
374152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
375*076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3768c3ff71bSJunchao Zhang 
3778c3ff71bSJunchao Zhang   PetscFunctionBegin;
3788c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
379152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
380152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
381152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3828c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
383152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
384*076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr);
385152b3e56SJunchao Zhang     mode = "N";
386152b3e56SJunchao Zhang   } else {
387*076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
388*076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
389152b3e56SJunchao Zhang     mode = "C";
390152b3e56SJunchao Zhang   }
391*076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
392152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
393152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
394152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
395bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
396*076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr);
397152b3e56SJunchao Zhang   PetscFunctionReturn(0);
398152b3e56SJunchao Zhang }
399152b3e56SJunchao Zhang 
400152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
401152b3e56SJunchao Zhang {
402152b3e56SJunchao Zhang   PetscErrorCode            ierr;
403152b3e56SJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
404152b3e56SJunchao Zhang 
405152b3e56SJunchao Zhang   PetscFunctionBegin;
406152b3e56SJunchao Zhang   switch (op) {
407152b3e56SJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
408152b3e56SJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
409152b3e56SJunchao Zhang       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
410152b3e56SJunchao Zhang       A->form_explicit_transpose = flg;
411152b3e56SJunchao Zhang       break;
412152b3e56SJunchao Zhang     default:
413152b3e56SJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
414152b3e56SJunchao Zhang       break;
415152b3e56SJunchao Zhang   }
4168c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4178c3ff71bSJunchao Zhang }
4188c3ff71bSJunchao Zhang 
419*076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
4203d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
4218c3ff71bSJunchao Zhang {
4228c3ff71bSJunchao Zhang   PetscErrorCode   ierr;
423*076ba34aSJunchao Zhang   Mat_SeqAIJ       *aseq;
4248c3ff71bSJunchao Zhang 
4258c3ff71bSJunchao Zhang   PetscFunctionBegin;
426a3f881fbSStefano Zampini   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
427*076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
428*076ba34aSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */
4298c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
430*076ba34aSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */
431*076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
432*076ba34aSJunchao Zhang     if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
433*076ba34aSJunchao Zhang     ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
434*076ba34aSJunchao Zhang     ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
435*076ba34aSJunchao Zhang     ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
436*076ba34aSJunchao Zhang     ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr);
437*076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ*>(A->data);
438*076ba34aSJunchao Zhang     if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */
439*076ba34aSJunchao Zhang       if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
440*076ba34aSJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
4418c3ff71bSJunchao Zhang     }
442*076ba34aSJunchao Zhang   }
4438c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4448c3ff71bSJunchao Zhang }
4458c3ff71bSJunchao Zhang 
446*076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
447*076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
448*076ba34aSJunchao Zhang  */
449*076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
4508c3ff71bSJunchao Zhang {
4518c3ff71bSJunchao Zhang   PetscErrorCode        ierr;
452*076ba34aSJunchao Zhang   Mat_SeqAIJ            *bseq;
453*076ba34aSJunchao Zhang   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
454*076ba34aSJunchao Zhang   Mat                   mat;
4558c3ff71bSJunchao Zhang 
4568c3ff71bSJunchao Zhang   PetscFunctionBegin;
457*076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
458*076ba34aSJunchao Zhang   ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr);
459*076ba34aSJunchao Zhang   mat  = *B;
460*076ba34aSJunchao Zhang   if (A->assembled) {
461*076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
462*076ba34aSJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
463*076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
464*076ba34aSJunchao Zhang     /* Now copy values to B if needed */
465*076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
466*076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
467*076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
468*076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
469*076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
470*076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
471*076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
472*076ba34aSJunchao Zhang       }
473*076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
474*076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
475*076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
476*076ba34aSJunchao Zhang     }
477*076ba34aSJunchao Zhang     mat->spptr = bkok;
478*076ba34aSJunchao Zhang   }
479*076ba34aSJunchao Zhang 
480*076ba34aSJunchao Zhang   ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr);
481*076ba34aSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
482*076ba34aSJunchao Zhang   ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr);
483*076ba34aSJunchao Zhang   ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr);
4848c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4858c3ff71bSJunchao Zhang }
4868c3ff71bSJunchao Zhang 
4878c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
4888c3ff71bSJunchao Zhang {
4898c3ff71bSJunchao Zhang   PetscErrorCode             ierr;
49086a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
4918c3ff71bSJunchao Zhang 
4928c3ff71bSJunchao Zhang   PetscFunctionBegin;
49386a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
49486a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
4958f7e8f9dSMark Adams     if (aijkok) {
4968f7e8f9dSMark Adams       if (aijkok->device_mat_d.data()) {
497a587d139SMark         delete aijkok->colmap_d;
498a587d139SMark         delete aijkok->i_uncompressed_d;
499a587d139SMark       }
5008f7e8f9dSMark Adams       if (aijkok->diag_d) delete aijkok->diag_d;
5018f7e8f9dSMark Adams     }
5028c3ff71bSJunchao Zhang     delete aijkok;
50386a27549SJunchao Zhang   } else {
50486a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
50586a27549SJunchao Zhang   }
506152b3e56SJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
507152b3e56SJunchao Zhang   A->spptr = NULL;
5088c3ff71bSJunchao Zhang   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
5098c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5108c3ff71bSJunchao Zhang }
5118c3ff71bSJunchao Zhang 
51286a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
51386a27549SJunchao Zhang {
51486a27549SJunchao Zhang   PetscErrorCode ierr;
51586a27549SJunchao Zhang 
51686a27549SJunchao Zhang   PetscFunctionBegin;
51786a27549SJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
51886a27549SJunchao Zhang   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
51986a27549SJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
52086a27549SJunchao Zhang   PetscFunctionReturn(0);
52186a27549SJunchao Zhang }
52286a27549SJunchao Zhang 
523*076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
524*076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
525a3f881fbSStefano Zampini {
526*076ba34aSJunchao Zhang   PetscErrorCode               ierr;
527*076ba34aSJunchao Zhang   Mat_SeqAIJ                   *a,*b;
528*076ba34aSJunchao Zhang   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
529*076ba34aSJunchao Zhang   MatScalarKokkosView          aa,ba,ca;
530*076ba34aSJunchao Zhang   MatRowMapKokkosView          ai,bi,ci;
531*076ba34aSJunchao Zhang   MatColIdxKokkosView          aj,bj,cj;
532*076ba34aSJunchao Zhang   PetscInt                     m,n,nnz,aN;
533a3f881fbSStefano Zampini 
534a3f881fbSStefano Zampini   PetscFunctionBegin;
535*076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
536*076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
537*076ba34aSJunchao Zhang   PetscValidPointer(C,4);
538*076ba34aSJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
539*076ba34aSJunchao Zhang   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
540*076ba34aSJunchao Zhang   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
541*076ba34aSJunchao Zhang   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
542*076ba34aSJunchao Zhang 
543*076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
544*076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
545*076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ*>(A->data);
546*076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ*>(B->data);
547*076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
548*076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
549*076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
550*076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
551*076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
552*076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
553*076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
554*076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
555*076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
556*076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
557*076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
558*076ba34aSJunchao Zhang     aj = akok->j_dual.view_device();
559*076ba34aSJunchao Zhang     bj = bkok->j_dual.view_device();
560*076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
561*076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
562*076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
563*076ba34aSJunchao Zhang     ca = ca_dual.view_device();
564*076ba34aSJunchao Zhang     ci = ci_dual.view_device();
565*076ba34aSJunchao Zhang     cj = cj_dual.view_device();
566*076ba34aSJunchao Zhang 
567*076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
568*076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
569*076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
570*076ba34aSJunchao Zhang       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
571*076ba34aSJunchao Zhang 
572*076ba34aSJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
573*076ba34aSJunchao Zhang         ci(i) = coffset;
574*076ba34aSJunchao Zhang         if (i == m-1) ci(m) = ai(m) + bi(m);
575*076ba34aSJunchao Zhang       });
576*076ba34aSJunchao Zhang 
577*076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
578*076ba34aSJunchao Zhang         if (k < alen) {
579*076ba34aSJunchao Zhang           ca(coffset+k) = aa(ai(i)+k);
580*076ba34aSJunchao Zhang           cj(coffset+k) = aj(ai(i)+k);
581*076ba34aSJunchao Zhang         } else {
582*076ba34aSJunchao Zhang           ca(coffset+k) = ba(bi(i)+k-alen);
583*076ba34aSJunchao Zhang           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
584*076ba34aSJunchao Zhang         }
585*076ba34aSJunchao Zhang       });
586*076ba34aSJunchao Zhang     });
587*076ba34aSJunchao Zhang     ca_dual.modify_device();
588*076ba34aSJunchao Zhang     ci_dual.modify_device();
589*076ba34aSJunchao Zhang     cj_dual.modify_device();
590*076ba34aSJunchao Zhang     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
591*076ba34aSJunchao Zhang     ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr);
592*076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
593*076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
594*076ba34aSJunchao Zhang     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
595*076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
596*076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
597*076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
598*076ba34aSJunchao Zhang 
599*076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
600*076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
601*076ba34aSJunchao Zhang       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
602*076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
603*076ba34aSJunchao Zhang         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
604*076ba34aSJunchao Zhang         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
605*076ba34aSJunchao Zhang       });
606*076ba34aSJunchao Zhang     });
607*076ba34aSJunchao Zhang     ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr);
608*076ba34aSJunchao Zhang   }
609*076ba34aSJunchao Zhang   PetscFunctionReturn(0);
610*076ba34aSJunchao Zhang }
611*076ba34aSJunchao Zhang 
612*076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
613*076ba34aSJunchao Zhang {
614*076ba34aSJunchao Zhang   PetscFunctionBegin;
615*076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
616a3f881fbSStefano Zampini   PetscFunctionReturn(0);
617a3f881fbSStefano Zampini }
618a3f881fbSStefano Zampini 
619a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
620a3f881fbSStefano Zampini {
621*076ba34aSJunchao Zhang   PetscErrorCode                 ierr;
622a3f881fbSStefano Zampini   Mat_Product                    *product = C->product;
623a3f881fbSStefano Zampini   Mat                            A,B;
624*076ba34aSJunchao Zhang   bool                           transA,transB; /* use bool, since KK needs this type */
625a3f881fbSStefano Zampini   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
626a3f881fbSStefano Zampini   Mat_SeqAIJ                     *c;
627*076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
628*076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB;
629a3f881fbSStefano Zampini 
630a3f881fbSStefano Zampini   PetscFunctionBegin;
631a3f881fbSStefano Zampini   MatCheckProduct(C,1);
632a3f881fbSStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
633*076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
634*076ba34aSJunchao Zhang 
635*076ba34aSJunchao Zhang   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
636*076ba34aSJunchao Zhang     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
637*076ba34aSJunchao Zhang     PetscFunctionReturn(0);
638*076ba34aSJunchao Zhang   }
639*076ba34aSJunchao Zhang 
640*076ba34aSJunchao Zhang   switch (product->type) {
641*076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
642*076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
643*076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
644*076ba34aSJunchao Zhang     default:
645*076ba34aSJunchao Zhang       SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
646*076ba34aSJunchao Zhang   }
647*076ba34aSJunchao Zhang 
648a3f881fbSStefano Zampini   A     = product->A;
649a3f881fbSStefano Zampini   B     = product->B;
650a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
651a3f881fbSStefano Zampini   ierr  = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
652a3f881fbSStefano Zampini   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
653a3f881fbSStefano Zampini   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
654a3f881fbSStefano Zampini   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
655*076ba34aSJunchao Zhang 
656*076ba34aSJunchao Zhang   if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
657*076ba34aSJunchao Zhang 
658*076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
659*076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
660*076ba34aSJunchao Zhang 
661*076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
662*076ba34aSJunchao Zhang   if (transA) {
663*076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
664*076ba34aSJunchao Zhang     transA = false;
665a3f881fbSStefano Zampini   }
666a3f881fbSStefano Zampini 
667*076ba34aSJunchao Zhang   if (transB) {
668*076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
669*076ba34aSJunchao Zhang     transB = false;
670*076ba34aSJunchao Zhang   }
671*076ba34aSJunchao Zhang 
672*076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
673*076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
674*076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr);
675a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
676a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
677a3f881fbSStefano Zampini   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
678a3f881fbSStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
679a3f881fbSStefano Zampini   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
680a3f881fbSStefano Zampini   c->reallocs         = 0;
681*076ba34aSJunchao Zhang   C->info.mallocs     = 0;
682a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
683a3f881fbSStefano Zampini   C->assembled        = C->was_assembled = PETSC_TRUE;
684a3f881fbSStefano Zampini   C->num_ass++;
685a3f881fbSStefano Zampini   PetscFunctionReturn(0);
686a3f881fbSStefano Zampini }
687a3f881fbSStefano Zampini 
688a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
689a3f881fbSStefano Zampini {
690a3f881fbSStefano Zampini   PetscErrorCode                 ierr;
691*076ba34aSJunchao Zhang   Mat_Product                    *product = C->product;
692*076ba34aSJunchao Zhang   MatProductType                 ptype;
693*076ba34aSJunchao Zhang   Mat                            A,B;
694*076ba34aSJunchao Zhang   bool                           transA,transB;
695*076ba34aSJunchao Zhang   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
696*076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
697*076ba34aSJunchao Zhang   MPI_Comm                       comm;
698*076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
699a3f881fbSStefano Zampini 
700a3f881fbSStefano Zampini   PetscFunctionBegin;
701a3f881fbSStefano Zampini   MatCheckProduct(C,1);
702*076ba34aSJunchao Zhang   ierr = PetscObjectGetComm((PetscObject)C,&comm);
703*076ba34aSJunchao Zhang   if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty");
704a3f881fbSStefano Zampini   A       = product->A;
705a3f881fbSStefano Zampini   B       = product->B;
706a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
707a3f881fbSStefano Zampini   ierr    = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
708a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
709a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
710*076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
711*076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
712*076ba34aSJunchao Zhang 
713a3f881fbSStefano Zampini   ptype   = product->type;
714a3f881fbSStefano Zampini   switch (ptype) {
715*076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
716*076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
717*076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
718a3f881fbSStefano Zampini     default:
719*076ba34aSJunchao Zhang       SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
720a3f881fbSStefano Zampini   }
721a3f881fbSStefano Zampini 
722*076ba34aSJunchao Zhang   product->data = pdata = new MatProductData_SeqAIJKokkos();
723*076ba34aSJunchao Zhang   pdata->kh.set_team_work_size(16);
724*076ba34aSJunchao Zhang   pdata->kh.set_dynamic_scheduling(true);
725*076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
726a3f881fbSStefano Zampini 
727*076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
728*076ba34aSJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
729*076ba34aSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
730*076ba34aSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
731*076ba34aSJunchao Zhang     /* This algorithm + cuda-10.2 sometimes gave wrong results (invalid device pointers in csrmatC) and failed snes/tutorials/ex56.c */
732*076ba34aSJunchao Zhang     spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_CUSPARSE;
733*076ba34aSJunchao Zhang   #endif
734*076ba34aSJunchao Zhang #endif
735*076ba34aSJunchao Zhang   pdata->kh.create_spgemm_handle(spgemm_alg);
736*076ba34aSJunchao Zhang 
737*076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
738*076ba34aSJunchao Zhang   if (transA) {
739*076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr);
740*076ba34aSJunchao Zhang     transA = false;
741*076ba34aSJunchao Zhang   }
742*076ba34aSJunchao Zhang 
743*076ba34aSJunchao Zhang   if (transB) {
744*076ba34aSJunchao Zhang     ierr   = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr);
745*076ba34aSJunchao Zhang     transB = false;
746*076ba34aSJunchao Zhang   }
747*076ba34aSJunchao Zhang 
748*076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
749*076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
750*076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
751*076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
752*076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
753*076ba34aSJunchao Zhang   */
754*076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
755*076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
756*076ba34aSJunchao Zhang 
757*076ba34aSJunchao Zhang   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
758*076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr);
759*076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
760a3f881fbSStefano Zampini   PetscFunctionReturn(0);
761a3f881fbSStefano Zampini }
762a3f881fbSStefano Zampini 
763a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
764*076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
765a3f881fbSStefano Zampini {
766a3f881fbSStefano Zampini   PetscErrorCode ierr;
767*076ba34aSJunchao Zhang   Mat_Product    *product = mat->product;
768a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
769a3f881fbSStefano Zampini 
770a3f881fbSStefano Zampini   PetscFunctionBegin;
771a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
772a3f881fbSStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
773a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
774a3f881fbSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
775a3f881fbSStefano Zampini   }
776a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
777a3f881fbSStefano Zampini     switch (product->type) {
778a3f881fbSStefano Zampini     case MATPRODUCT_AB:
779a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
780a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
781a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
782a3f881fbSStefano Zampini       break;
783a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
784a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
785a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
786a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
787a3f881fbSStefano Zampini       break;
788a3f881fbSStefano Zampini     default:
789a3f881fbSStefano Zampini       break;
790a3f881fbSStefano Zampini     }
791a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
792a3f881fbSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
793a3f881fbSStefano Zampini   }
794a3f881fbSStefano Zampini   PetscFunctionReturn(0);
795a3f881fbSStefano Zampini }
796a587d139SMark 
797f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
798f0cf5187SStefano Zampini {
799f0cf5187SStefano Zampini   PetscErrorCode   ierr;
800f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
801f0cf5187SStefano Zampini 
802f0cf5187SStefano Zampini   PetscFunctionBegin;
803f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
804f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
805*076ba34aSJunchao Zhang   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
806*076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
807f0cf5187SStefano Zampini   ierr = WaitForKokkos();CHKERRQ(ierr);
808*076ba34aSJunchao Zhang   ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr);
809f0cf5187SStefano Zampini   PetscFunctionReturn(0);
810f0cf5187SStefano Zampini }
811f0cf5187SStefano Zampini 
812a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
813a587d139SMark {
814a587d139SMark   PetscErrorCode   ierr;
815*076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
816a587d139SMark 
817a587d139SMark   PetscFunctionBegin;
818*076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
819*076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
820*076ba34aSJunchao Zhang   KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
821*076ba34aSJunchao Zhang   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); /* Cached diagonal values are invalided */
822*076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
823a587d139SMark   PetscFunctionReturn(0);
824a587d139SMark }
825a587d139SMark 
826db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
827db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
828db78de30SJunchao Zhang {
829db78de30SJunchao Zhang   PetscErrorCode     ierr;
830db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
831db78de30SJunchao Zhang 
832db78de30SJunchao Zhang   PetscFunctionBegin;
833db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
834db78de30SJunchao Zhang   PetscValidPointer(kv,2);
835db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
836db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
837db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
838*076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
839db78de30SJunchao Zhang   PetscFunctionReturn(0);
840db78de30SJunchao Zhang }
841db78de30SJunchao Zhang 
842db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
843db78de30SJunchao Zhang {
844db78de30SJunchao Zhang   PetscFunctionBegin;
845db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
846db78de30SJunchao Zhang   PetscValidPointer(kv,2);
847db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
848db78de30SJunchao Zhang   PetscFunctionReturn(0);
849db78de30SJunchao Zhang }
850db78de30SJunchao Zhang 
851db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv)
852db78de30SJunchao Zhang {
853db78de30SJunchao Zhang   PetscErrorCode     ierr;
854db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
855db78de30SJunchao Zhang 
856db78de30SJunchao Zhang   PetscFunctionBegin;
857db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
858db78de30SJunchao Zhang   PetscValidPointer(kv,2);
859db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
860db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
861db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
862*076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
863db78de30SJunchao Zhang   PetscFunctionReturn(0);
864db78de30SJunchao Zhang }
865db78de30SJunchao Zhang 
866db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv)
867db78de30SJunchao Zhang {
868db78de30SJunchao Zhang   PetscErrorCode     ierr;
869db78de30SJunchao Zhang 
870db78de30SJunchao Zhang   PetscFunctionBegin;
871db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
872db78de30SJunchao Zhang   PetscValidPointer(kv,2);
873db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
874*076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
875db78de30SJunchao Zhang   PetscFunctionReturn(0);
876db78de30SJunchao Zhang }
877db78de30SJunchao Zhang 
878db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
879db78de30SJunchao Zhang {
880db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
881db78de30SJunchao Zhang 
882db78de30SJunchao Zhang   PetscFunctionBegin;
883db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
884db78de30SJunchao Zhang   PetscValidPointer(kv,2);
885db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
886db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
887*076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
888db78de30SJunchao Zhang   PetscFunctionReturn(0);
889db78de30SJunchao Zhang }
890db78de30SJunchao Zhang 
891db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
892db78de30SJunchao Zhang {
893db78de30SJunchao Zhang   PetscErrorCode     ierr;
894db78de30SJunchao Zhang 
895db78de30SJunchao Zhang   PetscFunctionBegin;
896db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
897db78de30SJunchao Zhang   PetscValidPointer(kv,2);
898db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
899*076ba34aSJunchao Zhang   ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr);
900db78de30SJunchao Zhang   PetscFunctionReturn(0);
901db78de30SJunchao Zhang }
902db78de30SJunchao Zhang 
903db78de30SJunchao Zhang /* Computes Y += a*X */
904f0cf5187SStefano Zampini static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str)
905a587d139SMark {
906a587d139SMark   PetscErrorCode ierr;
907a587d139SMark   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
908a587d139SMark 
909a587d139SMark   PetscFunctionBegin;
910f0cf5187SStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
911a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
912a587d139SMark     PetscFunctionReturn(0);
913a587d139SMark   }
914db78de30SJunchao Zhang 
915f0cf5187SStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) {
916a587d139SMark     PetscBool e;
917a587d139SMark     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
918a587d139SMark     if (e) {
919a587d139SMark       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
920f0cf5187SStefano Zampini       if (e) str = SAME_NONZERO_PATTERN;
921a587d139SMark     }
922a587d139SMark   }
923db78de30SJunchao Zhang 
924a587d139SMark   if (str != SAME_NONZERO_PATTERN) {
925db78de30SJunchao Zhang     /* TODO: do MatAXPY on device */
926a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
927a587d139SMark     PetscFunctionReturn(0);
928a587d139SMark   } else {
929db78de30SJunchao Zhang     ConstPetscScalarKokkosView xv;
930db78de30SJunchao Zhang     PetscScalarKokkosView      yv;
931db78de30SJunchao Zhang 
932db78de30SJunchao Zhang     ierr = MatSeqAIJGetKokkosView(X,&xv);CHKERRQ(ierr);
933db78de30SJunchao Zhang     ierr = MatSeqAIJGetKokkosView(Y,&yv);CHKERRQ(ierr);
934db78de30SJunchao Zhang     KokkosBlas::axpy(a,xv,yv);
935db78de30SJunchao Zhang     ierr = MatSeqAIJRestoreKokkosView(X,&xv);CHKERRQ(ierr);
936db78de30SJunchao Zhang     ierr = MatSeqAIJRestoreKokkosView(Y,&yv);CHKERRQ(ierr);
937a587d139SMark   }
938a587d139SMark   PetscFunctionReturn(0);
939a587d139SMark }
940a587d139SMark 
94186a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
9428f7e8f9dSMark Adams {
9438f7e8f9dSMark Adams   PetscErrorCode ierr;
9448f7e8f9dSMark Adams 
9458f7e8f9dSMark Adams   PetscFunctionBegin;
9468f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
9478f7e8f9dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
9488f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
9498f7e8f9dSMark Adams   PetscFunctionReturn(0);
9508f7e8f9dSMark Adams }
9518f7e8f9dSMark Adams 
9528c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
9538c3ff71bSJunchao Zhang {
954*076ba34aSJunchao Zhang   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
955*076ba34aSJunchao Zhang 
9568c3ff71bSJunchao Zhang   PetscFunctionBegin;
957*076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
9586f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
9596f3d89d0SStefano Zampini 
9608c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
9618c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
9628c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
963a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
964f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
965a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
966*076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
9678c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
9688c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
9698c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
9708c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
9718c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
9728c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
973*076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
974152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
975*076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
976*076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
977*076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
978*076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
979*076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
980*076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
981*076ba34aSJunchao Zhang   PetscFunctionReturn(0);
982*076ba34aSJunchao Zhang }
983*076ba34aSJunchao Zhang 
984*076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
985*076ba34aSJunchao Zhang {
986*076ba34aSJunchao Zhang   PetscErrorCode     ierr;
987*076ba34aSJunchao Zhang   Mat_SeqAIJ         *aseq;
988*076ba34aSJunchao Zhang   PetscInt           i,m,n;
989*076ba34aSJunchao Zhang 
990*076ba34aSJunchao Zhang   PetscFunctionBegin;
991*076ba34aSJunchao Zhang   if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
992*076ba34aSJunchao Zhang 
993*076ba34aSJunchao Zhang   m    = akok->nrows();
994*076ba34aSJunchao Zhang   n    = akok->ncols();
995*076ba34aSJunchao Zhang   ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr);
996*076ba34aSJunchao Zhang   ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
997*076ba34aSJunchao Zhang 
998*076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
999*076ba34aSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1000*076ba34aSJunchao Zhang   aseq = (Mat_SeqAIJ*)(A)->data;
1001*076ba34aSJunchao Zhang 
1002*076ba34aSJunchao Zhang   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1003*076ba34aSJunchao Zhang   akok->j_dual.sync_host();
1004*076ba34aSJunchao Zhang 
1005*076ba34aSJunchao Zhang   aseq->i            = akok->i_host_data();
1006*076ba34aSJunchao Zhang   aseq->j            = akok->j_host_data();
1007*076ba34aSJunchao Zhang   aseq->a            = akok->a_host_data();
1008*076ba34aSJunchao Zhang   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1009*076ba34aSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE;
1010*076ba34aSJunchao Zhang   aseq->free_a       = PETSC_FALSE;
1011*076ba34aSJunchao Zhang   aseq->free_ij      = PETSC_FALSE;
1012*076ba34aSJunchao Zhang   aseq->nz           = akok->nnz();
1013*076ba34aSJunchao Zhang   aseq->maxnz        = aseq->nz;
1014*076ba34aSJunchao Zhang 
1015*076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr);
1016*076ba34aSJunchao Zhang   ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr);
1017*076ba34aSJunchao Zhang   for (i=0; i<m; i++) {
1018*076ba34aSJunchao Zhang     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1019*076ba34aSJunchao Zhang   }
1020*076ba34aSJunchao Zhang 
1021*076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1022*076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1023*076ba34aSJunchao Zhang   ierr     = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
1024*076ba34aSJunchao Zhang   ierr     = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
1025*076ba34aSJunchao Zhang   A->spptr = akok;
1026*076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1027*076ba34aSJunchao Zhang }
1028*076ba34aSJunchao Zhang 
1029*076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1030*076ba34aSJunchao Zhang 
1031*076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1032*076ba34aSJunchao Zhang  */
1033*076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1034*076ba34aSJunchao Zhang {
1035*076ba34aSJunchao Zhang   PetscErrorCode     ierr;
1036*076ba34aSJunchao Zhang 
1037*076ba34aSJunchao Zhang   PetscFunctionBegin;
1038*076ba34aSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1039*076ba34aSJunchao Zhang   ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr);
10408c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
10418c3ff71bSJunchao Zhang }
10428c3ff71bSJunchao Zhang 
10438c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
1044152b3e56SJunchao Zhang /*@C
10458c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
10468c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
10478c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
10488c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
10498c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
10508c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
10518c3ff71bSJunchao Zhang 
10528c3ff71bSJunchao Zhang    Collective
10538c3ff71bSJunchao Zhang 
10548c3ff71bSJunchao Zhang    Input Parameters:
10558c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
10568c3ff71bSJunchao Zhang .  m - number of rows
10578c3ff71bSJunchao Zhang .  n - number of columns
10588c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
10598c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
10608c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
10618c3ff71bSJunchao Zhang 
10628c3ff71bSJunchao Zhang    Output Parameter:
10638c3ff71bSJunchao Zhang .  A - the matrix
10648c3ff71bSJunchao Zhang 
10658c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
10668c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
10678c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
10688c3ff71bSJunchao Zhang 
10698c3ff71bSJunchao Zhang    Notes:
10708c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
10718c3ff71bSJunchao Zhang 
10728c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
10738c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
10748c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
10758c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
10768c3ff71bSJunchao Zhang 
10778c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
10788c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
10798c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
10808c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
10818c3ff71bSJunchao Zhang 
10828c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
10838c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
10848c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
10858c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
10868c3ff71bSJunchao Zhang 
10878c3ff71bSJunchao Zhang    Level: intermediate
10888c3ff71bSJunchao Zhang 
1089*076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
10908c3ff71bSJunchao Zhang @*/
10918c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
10928c3ff71bSJunchao Zhang {
10938c3ff71bSJunchao Zhang   PetscErrorCode ierr;
10948c3ff71bSJunchao Zhang 
10958c3ff71bSJunchao Zhang   PetscFunctionBegin;
10968c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
10978c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
10988c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
10998c3ff71bSJunchao Zhang   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
11008c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
11018c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
11028c3ff71bSJunchao Zhang }
1103930e68a5SMark Adams 
11048f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
11058f7e8f9dSMark Adams //
11068f7e8f9dSMark Adams // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
11078f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
11088f7e8f9dSMark Adams //
11098f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1110930e68a5SMark Adams {
11118f7e8f9dSMark Adams   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
11128f7e8f9dSMark Adams   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
11138f7e8f9dSMark Adams   IS                 isrow = b->row,isicol = b->icol;
1114930e68a5SMark Adams   PetscErrorCode     ierr;
11158f7e8f9dSMark Adams   const PetscInt     *r_h,*ic_h;
1116*076ba34aSJunchao Zhang   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data();
1117*076ba34aSJunchao Zhang   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1118*076ba34aSJunchao Zhang   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
11198f7e8f9dSMark Adams   PetscBool          row_identity,col_identity;
11208f7e8f9dSMark Adams   PetscInt           nc, Nf, nVec=32; // should be a parameter
11218f7e8f9dSMark Adams   PetscContainer     container;
1122930e68a5SMark Adams 
1123930e68a5SMark Adams   PetscFunctionBegin;
11248f7e8f9dSMark Adams   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n);
11258f7e8f9dSMark Adams   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
11268f7e8f9dSMark Adams   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
11278f7e8f9dSMark Adams   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
11288f7e8f9dSMark Adams   if (container) {
11298f7e8f9dSMark Adams     PetscInt *pNf=NULL, nv;
11308f7e8f9dSMark Adams     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
11318f7e8f9dSMark Adams     Nf = (*pNf)%1000;
11328f7e8f9dSMark Adams     nv = (*pNf)/1000;
11338f7e8f9dSMark Adams     if (nv>0) nVec = nv;
11348f7e8f9dSMark Adams   } else Nf = 1;
11358f7e8f9dSMark Adams   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
11368f7e8f9dSMark Adams   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
11378f7e8f9dSMark Adams   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
11388f7e8f9dSMark Adams   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
11398f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
11408f7e8f9dSMark Adams   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
11418f7e8f9dSMark Adams #endif
11428f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
11438f7e8f9dSMark Adams   {
11448f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
11458f7e8f9dSMark Adams     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
11468f7e8f9dSMark Adams     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
11478f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
11488f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
11498f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
11508f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
11518f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
11528f7e8f9dSMark Adams     size_t flops_h = 0.0;
11538f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
11548f7e8f9dSMark Adams     Kokkos::View<size_t> d_flops_k ("flops");
11558f7e8f9dSMark Adams     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
11568f7e8f9dSMark Adams     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
11578f7e8f9dSMark Adams     Kokkos::deep_copy (d_flops_k, h_flops_k);
11588f7e8f9dSMark Adams     Kokkos::deep_copy (d_r_k, h_r_k);
11598f7e8f9dSMark Adams     Kokkos::deep_copy (d_ic_k, h_ic_k);
11608f7e8f9dSMark Adams     // Fill A --> fact
11618f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1162042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
11638f7e8f9dSMark Adams         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
11648f7e8f9dSMark Adams         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
11658f7e8f9dSMark Adams         // zero rows of B
11668f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
11678f7e8f9dSMark Adams             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
11688f7e8f9dSMark Adams             PetscScalar *baL = ba_d + bi_d[rowb];
11698f7e8f9dSMark Adams             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
11708f7e8f9dSMark Adams             /* zero (unfactored row) */
11718f7e8f9dSMark Adams             for (int j=0;j<nzbL;j++) baL[j] = 0;
11728f7e8f9dSMark Adams             for (int j=0;j<nzbU;j++) baU[j] = 0;
11738f7e8f9dSMark Adams           });
11748f7e8f9dSMark Adams         // copy A into B
11758f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
11768f7e8f9dSMark Adams             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
11778f7e8f9dSMark Adams             const PetscScalar *av    = aa_d + ai_d[rowa];
11788f7e8f9dSMark Adams             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
11798f7e8f9dSMark Adams             /* load in initial (unfactored row) */
11808f7e8f9dSMark Adams             for (int j=0;j<nza;j++) {
11818f7e8f9dSMark Adams               PetscInt    colb = ic[ajtmp[j]];
11828f7e8f9dSMark Adams               PetscScalar vala = av[j];
11838f7e8f9dSMark Adams               if (colb == rowb) {
11848f7e8f9dSMark Adams                 *(ba_d + bdiag_d[rowb]) = vala;
11858f7e8f9dSMark Adams               } else {
11868f7e8f9dSMark Adams                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
11878f7e8f9dSMark Adams                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
11888f7e8f9dSMark Adams                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
11898f7e8f9dSMark Adams                 for (int j=0; j<nz ; j++) {
11908f7e8f9dSMark Adams                   if (pbj[j] == colb) {
11918f7e8f9dSMark Adams                     pba[j] = vala;
11928f7e8f9dSMark Adams                     set++;
11938f7e8f9dSMark Adams                     break;
11948f7e8f9dSMark Adams                   }
11958f7e8f9dSMark Adams                 }
11968f7e8f9dSMark Adams                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
11978f7e8f9dSMark Adams               }
11988f7e8f9dSMark Adams             }
11998f7e8f9dSMark Adams           });
12008f7e8f9dSMark Adams       });
12018f7e8f9dSMark Adams     Kokkos::fence();
1202930e68a5SMark Adams 
12038f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
12048f7e8f9dSMark Adams         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
12058f7e8f9dSMark Adams         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
12068f7e8f9dSMark Adams         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1207042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
12088f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc;
12098f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
12108f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
12118f7e8f9dSMark Adams         for (int ii=start; ii<end-1; ii++) {
12128f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
12138f7e8f9dSMark Adams           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
12148f7e8f9dSMark Adams           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
12158f7e8f9dSMark Adams           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
12168f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
12178f7e8f9dSMark Adams               PetscInt kIdx = j*Ni + field_block_idx;
12188f7e8f9dSMark Adams               if (kIdx >= nzUi) /* void */ ;
12198f7e8f9dSMark Adams               else {
12208f7e8f9dSMark Adams                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
12218f7e8f9dSMark Adams                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
12228f7e8f9dSMark Adams                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
12238f7e8f9dSMark Adams                 size_t         st_idx;
12248f7e8f9dSMark Adams                 // find and do L(k,i) = A(:k,i) / A(i,i)
12258f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
12268f7e8f9dSMark Adams                 // get column, there has got to be a better way
12278f7e8f9dSMark Adams                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
12288f7e8f9dSMark Adams                     //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii);
12298f7e8f9dSMark Adams                     if (pjL[j] == ii) {
12308f7e8f9dSMark Adams                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
12318f7e8f9dSMark Adams                       idx = j; // output
12328f7e8f9dSMark Adams                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
12338f7e8f9dSMark Adams                     }
12348f7e8f9dSMark Adams                   }, st_idx);
12358f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
12368f7e8f9dSMark Adams                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii);
12378f7e8f9dSMark Adams                 else { // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
12388f7e8f9dSMark Adams                   // U(i+1,:end)
12398f7e8f9dSMark Adams                   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
12408f7e8f9dSMark Adams                       PetscScalar Uij = baUi[uiIdx];
12418f7e8f9dSMark Adams                       PetscInt    col = bjUi[uiIdx];
12428f7e8f9dSMark Adams                       if (col==myk) {
12438f7e8f9dSMark Adams                         // A_kk = A_kk - L_ki * U_ij(k)
12448f7e8f9dSMark Adams                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
12458f7e8f9dSMark Adams                         *Akkv = *Akkv - L_ki() * Uij; // UiK
12468f7e8f9dSMark Adams                       } else {
12478f7e8f9dSMark Adams                         PetscScalar    *start, *end, *pAkjv=NULL;
12488f7e8f9dSMark Adams                         PetscInt       high, low;
12498f7e8f9dSMark Adams                         const PetscInt *startj;
12508f7e8f9dSMark Adams                         if (col<myk) { // L
12518f7e8f9dSMark Adams                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
12528f7e8f9dSMark Adams                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
12538f7e8f9dSMark Adams                           start = pLki+1; // start at pLki+1, A22(myk,1)
12548f7e8f9dSMark Adams                           startj= bj_d + bi_d[myk] + idx;
12558f7e8f9dSMark Adams                           end   = ba_d + bi_d[myk+1];
12568f7e8f9dSMark Adams                         } else {
12578f7e8f9dSMark Adams                           PetscInt idx = bdiag_d[myk+1]+1;
12588f7e8f9dSMark Adams                           start = ba_d + idx;
12598f7e8f9dSMark Adams                           startj= bj_d + idx;
12608f7e8f9dSMark Adams                           end   = ba_d + bdiag_d[myk];
12618f7e8f9dSMark Adams                         }
12628f7e8f9dSMark Adams                         // search for 'col', use bisection search - TODO
12638f7e8f9dSMark Adams                         low  = 0;
12648f7e8f9dSMark Adams                         high = (PetscInt)(end-start);
12658f7e8f9dSMark Adams                         while (high-low > 5) {
12668f7e8f9dSMark Adams                           int t = (low+high)/2;
12678f7e8f9dSMark Adams                           if (startj[t] > col) high = t;
12688f7e8f9dSMark Adams                           else                 low  = t;
12698f7e8f9dSMark Adams                         }
12708f7e8f9dSMark Adams                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
12718f7e8f9dSMark Adams                           if (startj[pAkjv-start] == col) break;
12728f7e8f9dSMark Adams                         }
12738f7e8f9dSMark Adams                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col);
12748f7e8f9dSMark Adams                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
12758f7e8f9dSMark Adams                       }
12768f7e8f9dSMark Adams                     });
12778f7e8f9dSMark Adams                 }
12788f7e8f9dSMark Adams               }
12798f7e8f9dSMark Adams             });
12808f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
12818f7e8f9dSMark Adams           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
12828f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
12838f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
12848f7e8f9dSMark Adams       });
12858f7e8f9dSMark Adams     Kokkos::fence();
12868f7e8f9dSMark Adams     Kokkos::deep_copy (h_flops_k, d_flops_k);
12878f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
12888f7e8f9dSMark Adams     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
12898f7e8f9dSMark Adams #elif  defined(PETSC_USE_LOG)
12908f7e8f9dSMark Adams     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
12918f7e8f9dSMark Adams #endif
12928f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
12938f7e8f9dSMark Adams         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
12948f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
12958f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
12968f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
12978f7e8f9dSMark Adams             int i = start + outer_index*Ni + lg_rank%Ni;
12988f7e8f9dSMark Adams             if (i < end) {
12998f7e8f9dSMark Adams               PetscScalar *pv = ba_d + bdiag_d[i];
13008f7e8f9dSMark Adams               *pv = 1.0/(*pv);
13018f7e8f9dSMark Adams             }
13028f7e8f9dSMark Adams           });
13038f7e8f9dSMark Adams       });
13048f7e8f9dSMark Adams   }
13058f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
13068f7e8f9dSMark Adams   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
13078f7e8f9dSMark Adams #endif
13088f7e8f9dSMark Adams   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
13098f7e8f9dSMark Adams   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
13108f7e8f9dSMark Adams 
13118f7e8f9dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
13128f7e8f9dSMark Adams   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
13138f7e8f9dSMark Adams   if (b->inode.size) {
13148f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
13158f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
13168f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
13178f7e8f9dSMark Adams   } else {
13188f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
13198f7e8f9dSMark Adams   }
13208f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
13218f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
13228f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
13238f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
13248f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
13258f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
13268f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
13278f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
13288f7e8f9dSMark Adams 
1329930e68a5SMark Adams   PetscFunctionReturn(0);
1330930e68a5SMark Adams }
1331930e68a5SMark Adams 
133286a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1333930e68a5SMark Adams {
1334930e68a5SMark Adams   PetscErrorCode   ierr;
1335930e68a5SMark Adams 
1336930e68a5SMark Adams   PetscFunctionBegin;
1337930e68a5SMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
133886a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
133986a27549SJunchao Zhang   PetscFunctionReturn(0);
134086a27549SJunchao Zhang }
134186a27549SJunchao Zhang 
134286a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
134386a27549SJunchao Zhang {
134486a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
134586a27549SJunchao Zhang 
134686a27549SJunchao Zhang   PetscFunctionBegin;
134786a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
134886a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
134986a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
135086a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
135186a27549SJunchao Zhang   }
135286a27549SJunchao Zhang   PetscFunctionReturn(0);
135386a27549SJunchao Zhang }
135486a27549SJunchao Zhang 
135586a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
135686a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
135786a27549SJunchao Zhang {
135886a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1359*076ba34aSJunchao Zhang   MatColIdxType             n = A->rmap->n;
136086a27549SJunchao Zhang 
136186a27549SJunchao Zhang   PetscFunctionBegin;
136286a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
136386a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
1364*076ba34aSJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
136586a27549SJunchao Zhang     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1366*076ba34aSJunchao Zhang     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1367*076ba34aSJunchao Zhang     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
136886a27549SJunchao Zhang 
136986a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1370*076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1371*076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1372*076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
137386a27549SJunchao Zhang         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
137486a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
137586a27549SJunchao Zhang 
137686a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
137786a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
137886a27549SJunchao Zhang     */
1379*076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1380*076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
138186a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
138286a27549SJunchao Zhang 
138386a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
138486a27549SJunchao Zhang 
138586a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
1386*076ba34aSJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
138786a27549SJunchao Zhang     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1388*076ba34aSJunchao Zhang     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1389*076ba34aSJunchao Zhang     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
139086a27549SJunchao Zhang 
139186a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1392*076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1393*076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1394*076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
139586a27549SJunchao Zhang         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
139686a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
139786a27549SJunchao Zhang 
139886a27549SJunchao Zhang     /* Sort indices. See comments above */
1399*076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1400*076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
140186a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
140286a27549SJunchao Zhang 
140386a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
140486a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
140586a27549SJunchao Zhang   }
140686a27549SJunchao Zhang   PetscFunctionReturn(0);
140786a27549SJunchao Zhang }
140886a27549SJunchao Zhang 
140986a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
141086a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
141186a27549SJunchao Zhang {
141286a27549SJunchao Zhang   PetscErrorCode                 ierr;
141386a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
141486a27549SJunchao Zhang   PetscScalarKokkosView          xv;
141586a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
141686a27549SJunchao Zhang 
141786a27549SJunchao Zhang   PetscFunctionBegin;
141886a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
141986a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
142086a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
142186a27549SJunchao Zhang   /* Solve L tmpv = b */
14223f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
142386a27549SJunchao Zhang   /* Solve Ux = tmpv */
14243f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
142586a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
142686a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
142786a27549SJunchao Zhang   PetscFunctionReturn(0);
142886a27549SJunchao Zhang }
142986a27549SJunchao Zhang 
1430*076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
143186a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
143286a27549SJunchao Zhang {
143386a27549SJunchao Zhang   PetscErrorCode                 ierr;
143486a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
143586a27549SJunchao Zhang   PetscScalarKokkosView          xv;
143686a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
143786a27549SJunchao Zhang 
143886a27549SJunchao Zhang   PetscFunctionBegin;
143986a27549SJunchao Zhang   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
144086a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
144186a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
144286a27549SJunchao Zhang   /* Solve U^T tmpv = b */
144386a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
144486a27549SJunchao Zhang 
144586a27549SJunchao Zhang   /* Solve L^T x = tmpv */
144686a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
144786a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
144886a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
144986a27549SJunchao Zhang   PetscFunctionReturn(0);
145086a27549SJunchao Zhang }
145186a27549SJunchao Zhang 
145286a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
145386a27549SJunchao Zhang {
145486a27549SJunchao Zhang   PetscErrorCode                 ierr;
145586a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
145686a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
145786a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
145886a27549SJunchao Zhang 
145986a27549SJunchao Zhang   PetscFunctionBegin;
146086a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1461*076ba34aSJunchao Zhang 
1462*076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1463*076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1464*076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1465*076ba34aSJunchao Zhang 
1466*076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
146786a27549SJunchao Zhang 
146886a27549SJunchao Zhang   B->assembled                       = PETSC_TRUE;
146986a27549SJunchao Zhang   B->preallocated                    = PETSC_TRUE;
147086a27549SJunchao Zhang   B->ops->solve                      = MatSolve_SeqAIJKokkos;
147186a27549SJunchao Zhang   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
147286a27549SJunchao Zhang   B->ops->matsolve                   = NULL;
147386a27549SJunchao Zhang   B->ops->matsolvetranspose          = NULL;
147486a27549SJunchao Zhang   B->offloadmask                     = PETSC_OFFLOAD_GPU;
147586a27549SJunchao Zhang 
147686a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
147786a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
147886a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
147986a27549SJunchao Zhang   PetscFunctionReturn(0);
148086a27549SJunchao Zhang }
148186a27549SJunchao Zhang 
148286a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
148386a27549SJunchao Zhang {
148486a27549SJunchao Zhang   PetscErrorCode                 ierr;
148586a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok;
148686a27549SJunchao Zhang   Mat_SeqAIJ                     *b;
148786a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
148886a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
148986a27549SJunchao Zhang   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
149086a27549SJunchao Zhang   PetscInt                       n = A->rmap->n;
149186a27549SJunchao Zhang 
149286a27549SJunchao Zhang   PetscFunctionBegin;
149386a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
149486a27549SJunchao Zhang   /* Rebuild factors */
149586a27549SJunchao Zhang   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
149686a27549SJunchao Zhang   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
149786a27549SJunchao Zhang 
149886a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
149986a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
150086a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
150186a27549SJunchao Zhang 
150286a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
150386a27549SJunchao Zhang 
150486a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
150586a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
150686a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d,n+1);
150786a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
150886a27549SJunchao Zhang 
150986a27549SJunchao Zhang   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1510*076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1511*076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1512*076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
151386a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
151486a27549SJunchao Zhang 
151586a27549SJunchao Zhang   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
151686a27549SJunchao Zhang   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
151786a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
151886a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
151986a27549SJunchao Zhang 
152086a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
152186a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
152286a27549SJunchao Zhang  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
152386a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
152486a27549SJunchao Zhang  #else
152586a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
152686a27549SJunchao Zhang  #endif
152786a27549SJunchao Zhang 
152886a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
152986a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
153086a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
153186a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
153286a27549SJunchao Zhang 
153386a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
153486a27549SJunchao Zhang   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
153586a27549SJunchao Zhang   b     = (Mat_SeqAIJ*)B->data;
153686a27549SJunchao Zhang   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
153786a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
153886a27549SJunchao Zhang   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
153986a27549SJunchao Zhang 
154086a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
154186a27549SJunchao Zhang   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1542930e68a5SMark Adams   PetscFunctionReturn(0);
1543930e68a5SMark Adams }
1544930e68a5SMark Adams 
15458f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
15468f7e8f9dSMark Adams {
15478f7e8f9dSMark Adams   PetscErrorCode   ierr;
15488f7e8f9dSMark Adams   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
15498f7e8f9dSMark Adams   const PetscInt   nrows   = A->rmap->n;
1550930e68a5SMark Adams 
15518f7e8f9dSMark Adams   PetscFunctionBegin;
15528f7e8f9dSMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
15538f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
15548f7e8f9dSMark Adams   // move B data into Kokkos
15558f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
15568f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
15578f7e8f9dSMark Adams   {
15588f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
15598f7e8f9dSMark Adams     if (!baijkok->diag_d) {
15608f7e8f9dSMark Adams       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1561152b3e56SJunchao Zhang       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
15628f7e8f9dSMark Adams       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
15638f7e8f9dSMark Adams     }
15648f7e8f9dSMark Adams   }
15658f7e8f9dSMark Adams   PetscFunctionReturn(0);
15668f7e8f9dSMark Adams }
15678f7e8f9dSMark Adams 
156886a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1569930e68a5SMark Adams {
1570930e68a5SMark Adams   PetscFunctionBegin;
1571930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
1572930e68a5SMark Adams   PetscFunctionReturn(0);
1573930e68a5SMark Adams }
1574930e68a5SMark Adams 
15758f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
15768f7e8f9dSMark Adams {
15778f7e8f9dSMark Adams   PetscFunctionBegin;
15788f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
15798f7e8f9dSMark Adams   PetscFunctionReturn(0);
15808f7e8f9dSMark Adams }
15818f7e8f9dSMark Adams 
1582930e68a5SMark Adams /*MC
158386a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
158486a27549SJunchao Zhang   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1585930e68a5SMark Adams 
1586930e68a5SMark Adams   Level: beginner
1587930e68a5SMark Adams 
158886a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1589930e68a5SMark Adams M*/
159086a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1591930e68a5SMark Adams {
1592930e68a5SMark Adams   PetscErrorCode ierr;
1593930e68a5SMark Adams   PetscInt       n = A->rmap->n;
1594930e68a5SMark Adams 
1595930e68a5SMark Adams   PetscFunctionBegin;
1596930e68a5SMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1597930e68a5SMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1598930e68a5SMark Adams   (*B)->factortype = ftype;
15994ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1600930e68a5SMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1601930e68a5SMark Adams 
16028f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
1603930e68a5SMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
160486a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_TRUE;
160586a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
160686a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
160786a27549SJunchao Zhang     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
160886a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
160986a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
161086a27549SJunchao Zhang   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1611930e68a5SMark Adams 
1612930e68a5SMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
161386a27549SJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1614930e68a5SMark Adams   PetscFunctionReturn(0);
1615930e68a5SMark Adams }
16168f7e8f9dSMark Adams 
16178f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
16188f7e8f9dSMark Adams {
16198f7e8f9dSMark Adams   PetscErrorCode ierr;
16208f7e8f9dSMark Adams   PetscInt       n = A->rmap->n;
16218f7e8f9dSMark Adams 
16228f7e8f9dSMark Adams   PetscFunctionBegin;
16238f7e8f9dSMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
16248f7e8f9dSMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
16258f7e8f9dSMark Adams   (*B)->factortype = ftype;
1626f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
16274ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
16288f7e8f9dSMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
16298f7e8f9dSMark Adams 
16308f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
16318f7e8f9dSMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
16328f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
16338f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
16348f7e8f9dSMark Adams 
16358f7e8f9dSMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
16368f7e8f9dSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
16378f7e8f9dSMark Adams   PetscFunctionReturn(0);
16388f7e8f9dSMark Adams }
163986a27549SJunchao Zhang 
164086a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
164186a27549SJunchao Zhang {
164286a27549SJunchao Zhang   PetscErrorCode ierr;
164386a27549SJunchao Zhang 
164486a27549SJunchao Zhang   PetscFunctionBegin;
164586a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
164686a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
164786a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
164886a27549SJunchao Zhang   PetscFunctionReturn(0);
164986a27549SJunchao Zhang }
165086a27549SJunchao Zhang 
1651*076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
1652*076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1653*076ba34aSJunchao Zhang {
1654*076ba34aSJunchao Zhang   PetscErrorCode    ierr;
1655*076ba34aSJunchao Zhang   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1656*076ba34aSJunchao Zhang   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1657*076ba34aSJunchao Zhang   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1658*076ba34aSJunchao Zhang   const PetscInt    *i = iv.data();
1659*076ba34aSJunchao Zhang   const PetscInt    *j = jv.data();
1660*076ba34aSJunchao Zhang   const PetscScalar *a = av.data();
1661*076ba34aSJunchao Zhang   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1662*076ba34aSJunchao Zhang 
1663*076ba34aSJunchao Zhang   PetscFunctionBegin;
1664*076ba34aSJunchao Zhang   ierr = PetscPrintf(PETSC_COMM_SELF,"%D x %D SeqAIJKokkos, with %D nonzeros\n",m,n,nnz);CHKERRQ(ierr);
1665*076ba34aSJunchao Zhang   for (PetscInt k=0; k<m; k++) {
1666*076ba34aSJunchao Zhang     ierr = PetscPrintf(PETSC_COMM_SELF,"%D: ",k);CHKERRQ(ierr);
1667*076ba34aSJunchao Zhang     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1668*076ba34aSJunchao Zhang       ierr = PetscPrintf(PETSC_COMM_SELF,"%D(%.1f), ",j[p],a[p]);CHKERRQ(ierr);
1669*076ba34aSJunchao Zhang     }
1670*076ba34aSJunchao Zhang     ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr);
1671*076ba34aSJunchao Zhang   }
1672*076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1673*076ba34aSJunchao Zhang }
1674