xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 5f80ce2ab25dff0f4601e710601cbbcecf323266)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2076ba34aSJunchao Zhang #include <petscpkg_version.h>
3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
442550becSJunchao Zhang #include <petsc/private/sfimpl.h>
58c3ff71bSJunchao Zhang #include <petscsystypes.h>
68c3ff71bSJunchao Zhang #include <petscerror.h>
78c3ff71bSJunchao Zhang 
88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
9f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
10076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp>
118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
128c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
15076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp>
16076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp>
1786a27549SJunchao Zhang 
1842550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
198c3ff71bSJunchao Zhang 
208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
218c3ff71bSJunchao Zhang 
22076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
23076ba34aSJunchao Zhang    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
24076ba34aSJunchao Zhang    In the latter case, it is important to set a_dual's sync state correctly.
25076ba34aSJunchao Zhang  */
268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
278c3ff71bSJunchao Zhang {
28076ba34aSJunchao Zhang   Mat_SeqAIJ       *aijseq;
29076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
308c3ff71bSJunchao Zhang 
318c3ff71bSJunchao Zhang   PetscFunctionBegin;
32076ba34aSJunchao Zhang   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
33*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAssemblyEnd_SeqAIJ(A,mode));
34076ba34aSJunchao Zhang 
35076ba34aSJunchao Zhang   aijseq = static_cast<Mat_SeqAIJ*>(A->data);
36076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
37076ba34aSJunchao Zhang 
38076ba34aSJunchao Zhang   /* If aijkok does not exist, we just copy i, j to device.
39076ba34aSJunchao Zhang      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
40076ba34aSJunchao Zhang      In both cases, we build a new aijkok structure.
41076ba34aSJunchao Zhang   */
42076ba34aSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
43076ba34aSJunchao Zhang     delete aijkok;
44076ba34aSJunchao Zhang     aijkok   = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/);
45076ba34aSJunchao Zhang     A->spptr = aijkok;
46076ba34aSJunchao Zhang   }
47076ba34aSJunchao Zhang 
48a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
49a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
50a587d139SMark   }
518c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
528c3ff71bSJunchao Zhang }
538c3ff71bSJunchao Zhang 
5486a27549SJunchao Zhang /* Sync CSR data to device if not yet */
55076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
568c3ff71bSJunchao Zhang {
578c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
588c3ff71bSJunchao Zhang 
598c3ff71bSJunchao Zhang   PetscFunctionBegin;
60*5f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
61*5f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
62076ba34aSJunchao Zhang   if (aijkok->a_dual.need_sync_device()) {
63076ba34aSJunchao Zhang     aijkok->a_dual.sync_device();
64580c7c76SPierre Jolivet     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
6586a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
668c3ff71bSJunchao Zhang   }
678c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
688c3ff71bSJunchao Zhang }
698c3ff71bSJunchao Zhang 
70076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */
71fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
7286a27549SJunchao Zhang {
7386a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
7486a27549SJunchao Zhang 
7586a27549SJunchao Zhang   PetscFunctionBegin;
76*5f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
7786a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
7886a27549SJunchao Zhang   aijkok->a_dual.modify_device();
7986a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
8086a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
81*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJInvalidateDiagonal(A));
82*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectStateIncrease((PetscObject)A));
8386a27549SJunchao Zhang   PetscFunctionReturn(0);
8486a27549SJunchao Zhang }
8586a27549SJunchao Zhang 
86f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
87f0cf5187SStefano Zampini {
88f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
89f0cf5187SStefano Zampini 
90f0cf5187SStefano Zampini   PetscFunctionBegin;
91f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
9286a27549SJunchao Zhang   /* We do not expect one needs factors on host  */
93*5f80ce2aSJacob Faibussowitsch   PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
94*5f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
95076ba34aSJunchao Zhang   aijkok->a_dual.sync_host();
96f0cf5187SStefano Zampini   PetscFunctionReturn(0);
97f0cf5187SStefano Zampini }
98f0cf5187SStefano Zampini 
99f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
100f0cf5187SStefano Zampini {
101076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
102f0cf5187SStefano Zampini 
103f0cf5187SStefano Zampini   PetscFunctionBegin;
104076ba34aSJunchao Zhang   if (aijkok) {
105076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
106076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
107076ba34aSJunchao Zhang   } else { /* Happens when calling MatSetValues on a newly created matrix */
108076ba34aSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
109076ba34aSJunchao Zhang   }
110076ba34aSJunchao Zhang   PetscFunctionReturn(0);
111076ba34aSJunchao Zhang }
112076ba34aSJunchao Zhang 
113076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
114076ba34aSJunchao Zhang {
115076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
116076ba34aSJunchao Zhang 
117076ba34aSJunchao Zhang   PetscFunctionBegin;
118076ba34aSJunchao Zhang   if (aijkok) aijkok->a_dual.modify_host();
119076ba34aSJunchao Zhang   PetscFunctionReturn(0);
120076ba34aSJunchao Zhang }
121076ba34aSJunchao Zhang 
122076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
123076ba34aSJunchao Zhang {
124076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
125076ba34aSJunchao Zhang 
126076ba34aSJunchao Zhang   PetscFunctionBegin;
1272328674fSJunchao Zhang   if (aijkok) {
128076ba34aSJunchao Zhang     aijkok->a_dual.sync_host();
129076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1302328674fSJunchao Zhang   } else {
1312328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1322328674fSJunchao Zhang   }
133076ba34aSJunchao Zhang   PetscFunctionReturn(0);
134076ba34aSJunchao Zhang }
135076ba34aSJunchao Zhang 
136076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
137076ba34aSJunchao Zhang {
138076ba34aSJunchao Zhang   PetscFunctionBegin;
139076ba34aSJunchao Zhang   *array = NULL;
140076ba34aSJunchao Zhang   PetscFunctionReturn(0);
141076ba34aSJunchao Zhang }
142076ba34aSJunchao Zhang 
143076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
144076ba34aSJunchao Zhang {
145076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
146076ba34aSJunchao Zhang 
147076ba34aSJunchao Zhang   PetscFunctionBegin;
1482328674fSJunchao Zhang   if (aijkok) {
149076ba34aSJunchao Zhang     *array = aijkok->a_dual.view_host().data();
1502328674fSJunchao Zhang   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
1512328674fSJunchao Zhang     *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
1522328674fSJunchao Zhang   }
153076ba34aSJunchao Zhang   PetscFunctionReturn(0);
154076ba34aSJunchao Zhang }
155076ba34aSJunchao Zhang 
156076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
157076ba34aSJunchao Zhang {
158076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
159076ba34aSJunchao Zhang 
160076ba34aSJunchao Zhang   PetscFunctionBegin;
1612328674fSJunchao Zhang   if (aijkok) {
162076ba34aSJunchao Zhang     aijkok->a_dual.clear_sync_state();
163076ba34aSJunchao Zhang     aijkok->a_dual.modify_host();
1642328674fSJunchao Zhang   }
165f0cf5187SStefano Zampini   PetscFunctionReturn(0);
166f0cf5187SStefano Zampini }
167f0cf5187SStefano Zampini 
168a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
169042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat)
170a587d139SMark {
171042217e8SBarry Smith   Mat_SeqAIJKokkos                             *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
172042217e8SBarry Smith   Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat);
173a587d139SMark 
174a587d139SMark   PetscFunctionBegin;
175*5f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
176152b3e56SJunchao Zhang   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
177a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
178a587d139SMark   PetscFunctionReturn(0);
179a587d139SMark }
180a587d139SMark 
181a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
182042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat)
183a587d139SMark {
184042217e8SBarry Smith   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
185a587d139SMark 
186a587d139SMark   PetscFunctionBegin;
187a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
188a587d139SMark     *d_mat = aijkok->device_mat_d.data();
189a587d139SMark   } else {
190*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it)
191a587d139SMark     *d_mat  = NULL;
192a587d139SMark   }
193a587d139SMark   PetscFunctionReturn(0);
194a587d139SMark }
195076ba34aSJunchao Zhang 
196076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */
197076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT)
198152b3e56SJunchao Zhang {
199152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
200152b3e56SJunchao Zhang 
201152b3e56SJunchao Zhang   PetscFunctionBegin;
202*5f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
203076ba34aSJunchao Zhang   if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */
204076ba34aSJunchao Zhang     /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */
205076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
206076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
207076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT));
20886a27549SJunchao Zhang     aijkok->transpose_updated = PETSC_TRUE;
209076ba34aSJunchao Zhang   }
210076ba34aSJunchao Zhang   *csrmatT = &aijkok->csrmatT;
211152b3e56SJunchao Zhang   PetscFunctionReturn(0);
212152b3e56SJunchao Zhang }
213152b3e56SJunchao Zhang 
214076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */
215076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH)
216152b3e56SJunchao Zhang {
217152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
218152b3e56SJunchao Zhang 
219152b3e56SJunchao Zhang   PetscFunctionBegin;
220*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
221*5f80ce2aSJacob Faibussowitsch   PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
222076ba34aSJunchao Zhang   if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */
223076ba34aSJunchao Zhang     CHKERRCXX(aijkok->a_dual.sync_device());
224076ba34aSJunchao Zhang     CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat));
225076ba34aSJunchao Zhang     CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH));
226076ba34aSJunchao Zhang    #if defined(PETSC_USE_COMPLEX)
227076ba34aSJunchao Zhang     const auto& a = aijkok->csrmatH.values;
228076ba34aSJunchao Zhang     Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));});
229076ba34aSJunchao Zhang    #endif
23086a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_TRUE;
231076ba34aSJunchao Zhang   }
232076ba34aSJunchao Zhang   *csrmatH = &aijkok->csrmatH;
233*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
234152b3e56SJunchao Zhang   PetscFunctionReturn(0);
235152b3e56SJunchao Zhang }
236a587d139SMark 
2378c3ff71bSJunchao Zhang /* y = A x */
2388c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2398c3ff71bSJunchao Zhang {
2408c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
241152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
242152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2438c3ff71bSJunchao Zhang 
2448c3ff71bSJunchao Zhang   PetscFunctionBegin;
245*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
246*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(A));
247*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetKokkosView(xx,&xv));
248*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetKokkosViewWrite(yy,&yv));
2498c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
250152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
251*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreKokkosView(xx,&xv));
252*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreKokkosViewWrite(yy,&yv));
253076ba34aSJunchao Zhang   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
254*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
255*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
2568c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2578c3ff71bSJunchao Zhang }
2588c3ff71bSJunchao Zhang 
2598c3ff71bSJunchao Zhang /* y = A^T x */
2608c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2618c3ff71bSJunchao Zhang {
2628c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
263152b3e56SJunchao Zhang   const char                       *mode;
264152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
265152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
266076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
2678c3ff71bSJunchao Zhang 
2688c3ff71bSJunchao Zhang   PetscFunctionBegin;
269*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
270*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(A));
271*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetKokkosView(xx,&xv));
272*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetKokkosViewWrite(yy,&yv));
273152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
274*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
275152b3e56SJunchao Zhang     mode = "N";
276152b3e56SJunchao Zhang   } else {
277076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
278076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
279152b3e56SJunchao Zhang     mode = "T";
280152b3e56SJunchao Zhang   }
281076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
282*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreKokkosView(xx,&xv));
283*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreKokkosViewWrite(yy,&yv));
284*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuFlops(2.0*csrmat->nnz()));
285*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
2868c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2878c3ff71bSJunchao Zhang }
2888c3ff71bSJunchao Zhang 
2898c3ff71bSJunchao Zhang /* y = A^H x */
2908c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2918c3ff71bSJunchao Zhang {
2928c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
293152b3e56SJunchao Zhang   const char                       *mode;
294152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
295152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
296076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
2978c3ff71bSJunchao Zhang 
2988c3ff71bSJunchao Zhang   PetscFunctionBegin;
299*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
300*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(A));
301*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetKokkosView(xx,&xv));
302*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetKokkosViewWrite(yy,&yv));
303152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
304*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
305152b3e56SJunchao Zhang     mode = "N";
306152b3e56SJunchao Zhang   } else {
307076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
308076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
309152b3e56SJunchao Zhang     mode = "C";
310152b3e56SJunchao Zhang   }
311076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
312*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreKokkosView(xx,&xv));
313*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreKokkosViewWrite(yy,&yv));
314*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuFlops(2.0*csrmat->nnz()));
315*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
3168c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3178c3ff71bSJunchao Zhang }
3188c3ff71bSJunchao Zhang 
3198c3ff71bSJunchao Zhang /* z = A x + y */
3208c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
3218c3ff71bSJunchao Zhang {
3228c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
323152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
324152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
3258c3ff71bSJunchao Zhang 
3268c3ff71bSJunchao Zhang   PetscFunctionBegin;
327*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
328*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(A));
329*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetKokkosView(xx,&xv));
330*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetKokkosView(yy,&yv));
331*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetKokkosViewWrite(zz,&zv));
3328c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
3338c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
334152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
335*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreKokkosView(xx,&xv));
336*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreKokkosView(yy,&yv));
337*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreKokkosViewWrite(zz,&zv));
338*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz()));
339*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
3408c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3418c3ff71bSJunchao Zhang }
3428c3ff71bSJunchao Zhang 
3438c3ff71bSJunchao Zhang /* z = A^T x + y */
3448c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3458c3ff71bSJunchao Zhang {
3468c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
347152b3e56SJunchao Zhang   const char                       *mode;
348152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
349152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
350076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3518c3ff71bSJunchao Zhang 
3528c3ff71bSJunchao Zhang   PetscFunctionBegin;
353*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
354*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(A));
355*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetKokkosView(xx,&xv));
356*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetKokkosView(yy,&yv));
357*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetKokkosViewWrite(zz,&zv));
3588c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
359152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
360*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat));
361152b3e56SJunchao Zhang     mode = "N";
362152b3e56SJunchao Zhang   } else {
363076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
364076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
365152b3e56SJunchao Zhang     mode = "T";
366152b3e56SJunchao Zhang   }
367076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
368*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreKokkosView(xx,&xv));
369*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreKokkosView(yy,&yv));
370*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreKokkosViewWrite(zz,&zv));
371*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuFlops(2.0*csrmat->nnz()));
372*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
3738c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3748c3ff71bSJunchao Zhang }
3758c3ff71bSJunchao Zhang 
3768c3ff71bSJunchao Zhang /* z = A^H x + y */
3778c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3788c3ff71bSJunchao Zhang {
3798c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
380152b3e56SJunchao Zhang   const char                       *mode;
381152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
382152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
383076ba34aSJunchao Zhang   KokkosCsrMatrix                  *csrmat;
3848c3ff71bSJunchao Zhang 
3858c3ff71bSJunchao Zhang   PetscFunctionBegin;
386*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
387*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(A));
388*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetKokkosView(xx,&xv));
389*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetKokkosView(yy,&yv));
390*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetKokkosViewWrite(zz,&zv));
3918c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
392152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
393*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat));
394152b3e56SJunchao Zhang     mode = "N";
395152b3e56SJunchao Zhang   } else {
396076ba34aSJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
397076ba34aSJunchao Zhang     csrmat = &aijkok->csrmat;
398152b3e56SJunchao Zhang     mode = "C";
399152b3e56SJunchao Zhang   }
400076ba34aSJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
401*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreKokkosView(xx,&xv));
402*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreKokkosView(yy,&yv));
403*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreKokkosViewWrite(zz,&zv));
404*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuFlops(2.0*csrmat->nnz()));
405*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
406152b3e56SJunchao Zhang   PetscFunctionReturn(0);
407152b3e56SJunchao Zhang }
408152b3e56SJunchao Zhang 
409152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
410152b3e56SJunchao Zhang {
411152b3e56SJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
412152b3e56SJunchao Zhang 
413152b3e56SJunchao Zhang   PetscFunctionBegin;
414152b3e56SJunchao Zhang   switch (op) {
415152b3e56SJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
416152b3e56SJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
417*5f80ce2aSJacob Faibussowitsch       if (A->form_explicit_transpose && !flg && aijkok) CHKERRQ(aijkok->DestroyMatTranspose());
418152b3e56SJunchao Zhang       A->form_explicit_transpose = flg;
419152b3e56SJunchao Zhang       break;
420152b3e56SJunchao Zhang     default:
421*5f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSetOption_SeqAIJ(A,op,flg));
422152b3e56SJunchao Zhang       break;
423152b3e56SJunchao Zhang   }
4248c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4258c3ff71bSJunchao Zhang }
4268c3ff71bSJunchao Zhang 
427076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */
4283d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
4298c3ff71bSJunchao Zhang {
430076ba34aSJunchao Zhang   Mat_SeqAIJ       *aseq;
4318c3ff71bSJunchao Zhang 
4328c3ff71bSJunchao Zhang   PetscFunctionBegin;
433*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscKokkosInitializeCheck());
434076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
435*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */
4368c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
437*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
438076ba34aSJunchao Zhang   } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
439*5f80ce2aSJacob Faibussowitsch     PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX");
440*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(A->defaultvectype));
441*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */
442*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS));
443*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetOps_SeqAIJKokkos(A));
444076ba34aSJunchao Zhang     aseq = static_cast<Mat_SeqAIJ*>(A->data);
445394ed5ebSJunchao Zhang     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
446*5f80ce2aSJacob Faibussowitsch       PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
447076ba34aSJunchao Zhang       A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE);
4488c3ff71bSJunchao Zhang     }
449076ba34aSJunchao Zhang   }
4508c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4518c3ff71bSJunchao Zhang }
4528c3ff71bSJunchao Zhang 
453076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
454076ba34aSJunchao Zhang    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
455076ba34aSJunchao Zhang  */
456076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B)
4578c3ff71bSJunchao Zhang {
458076ba34aSJunchao Zhang   Mat_SeqAIJ            *bseq;
459076ba34aSJunchao Zhang   Mat_SeqAIJKokkos      *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok;
460076ba34aSJunchao Zhang   Mat                   mat;
4618c3ff71bSJunchao Zhang 
4628c3ff71bSJunchao Zhang   PetscFunctionBegin;
463076ba34aSJunchao Zhang   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
464*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B));
465076ba34aSJunchao Zhang   mat  = *B;
466076ba34aSJunchao Zhang   if (A->assembled) {
467076ba34aSJunchao Zhang     bseq = static_cast<Mat_SeqAIJ*>(mat->data);
468076ba34aSJunchao Zhang     bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE);
469076ba34aSJunchao Zhang     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
470076ba34aSJunchao Zhang     /* Now copy values to B if needed */
471076ba34aSJunchao Zhang     if (dupOption == MAT_COPY_VALUES) {
472076ba34aSJunchao Zhang       if (akok->a_dual.need_sync_device()) {
473076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host());
474076ba34aSJunchao Zhang         bkok->a_dual.modify_host();
475076ba34aSJunchao Zhang       } else { /* If device has the latest data, we only copy data on device */
476076ba34aSJunchao Zhang         Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device());
477076ba34aSJunchao Zhang         bkok->a_dual.modify_device();
478076ba34aSJunchao Zhang       }
479076ba34aSJunchao Zhang     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
480076ba34aSJunchao Zhang       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
481076ba34aSJunchao Zhang       bkok->a_dual.modify_host();
482076ba34aSJunchao Zhang     }
483076ba34aSJunchao Zhang     mat->spptr = bkok;
484076ba34aSJunchao Zhang   }
485076ba34aSJunchao Zhang 
486*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(mat->defaultvectype));
487*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */
488*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS));
489*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetOps_SeqAIJKokkos(mat));
4908c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4918c3ff71bSJunchao Zhang }
4928c3ff71bSJunchao Zhang 
4930ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B)
4940ecb592aSJunchao Zhang {
4950ecb592aSJunchao Zhang   Mat               At;
496ff751488SJunchao Zhang   KokkosCsrMatrix   *internT;
4970ecb592aSJunchao Zhang   Mat_SeqAIJKokkos  *atkok,*bkok;
4980ecb592aSJunchao Zhang 
4990ecb592aSJunchao Zhang   PetscFunctionBegin;
500*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */
5010ecb592aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
502ff751488SJunchao Zhang     /* Deep copy internT, as we want to isolate the internal transpose */
503ff751488SJunchao Zhang     CHKERRCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT)));
504*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At));
5050ecb592aSJunchao Zhang     if (reuse == MAT_INITIAL_MATRIX) *B = At;
506*5f80ce2aSJacob Faibussowitsch     else CHKERRQ(MatHeaderReplace(A,&At)); /* Replace A with At inplace */
5070ecb592aSJunchao Zhang   } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
5080ecb592aSJunchao Zhang     if ((*B)->assembled) {
5090ecb592aSJunchao Zhang       bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr);
5100ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values));
511*5f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSeqAIJKokkosModifyDevice(*B));
5120ecb592aSJunchao Zhang     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
5130ecb592aSJunchao Zhang       Mat_SeqAIJ              *bseq = static_cast<Mat_SeqAIJ*>((*B)->data);
5140ecb592aSJunchao Zhang       MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */
5150ecb592aSJunchao Zhang       MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz());
5160ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(a_h,internT->values));
5170ecb592aSJunchao Zhang       CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries));
5180ecb592aSJunchao Zhang     } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated");
5190ecb592aSJunchao Zhang   }
5200ecb592aSJunchao Zhang   PetscFunctionReturn(0);
5210ecb592aSJunchao Zhang }
5220ecb592aSJunchao Zhang 
5238c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
5248c3ff71bSJunchao Zhang {
52586a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
5268c3ff71bSJunchao Zhang 
5278c3ff71bSJunchao Zhang   PetscFunctionBegin;
52886a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
52986a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
5308c3ff71bSJunchao Zhang     delete aijkok;
53186a27549SJunchao Zhang   } else {
53286a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
53386a27549SJunchao Zhang   }
534cbc6b225SStefano Zampini   A->spptr = NULL;
535*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
536*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
537*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
538*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatDestroy_SeqAIJ(A));
5398c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5408c3ff71bSJunchao Zhang }
5418c3ff71bSJunchao Zhang 
5423f3ba80aSJunchao Zhang /*MC
5433f3ba80aSJunchao Zhang    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
5443f3ba80aSJunchao Zhang 
5453f3ba80aSJunchao Zhang    A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types
5463f3ba80aSJunchao Zhang 
5473f3ba80aSJunchao Zhang    Options Database Keys:
5483f3ba80aSJunchao Zhang .  -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions()
5493f3ba80aSJunchao Zhang 
5503f3ba80aSJunchao Zhang   Level: beginner
5513f3ba80aSJunchao Zhang 
5523f3ba80aSJunchao Zhang .seealso: MatCreateSeqAIJKokkos(), MATMPIAIJKOKKOS
5533f3ba80aSJunchao Zhang M*/
55486a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
55586a27549SJunchao Zhang {
55686a27549SJunchao Zhang   PetscFunctionBegin;
557*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscKokkosInitializeCheck());
558*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCreate_SeqAIJ(A));
559*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A));
56086a27549SJunchao Zhang   PetscFunctionReturn(0);
56186a27549SJunchao Zhang }
56286a27549SJunchao Zhang 
563076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
564076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
565a3f881fbSStefano Zampini {
566076ba34aSJunchao Zhang   Mat_SeqAIJ                   *a,*b;
567076ba34aSJunchao Zhang   Mat_SeqAIJKokkos             *akok,*bkok,*ckok;
568076ba34aSJunchao Zhang   MatScalarKokkosView          aa,ba,ca;
569076ba34aSJunchao Zhang   MatRowMapKokkosView          ai,bi,ci;
570076ba34aSJunchao Zhang   MatColIdxKokkosView          aj,bj,cj;
571076ba34aSJunchao Zhang   PetscInt                     m,n,nnz,aN;
572a3f881fbSStefano Zampini 
573a3f881fbSStefano Zampini   PetscFunctionBegin;
574076ba34aSJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
575076ba34aSJunchao Zhang   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
576076ba34aSJunchao Zhang   PetscValidPointer(C,4);
577076ba34aSJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
578076ba34aSJunchao Zhang   PetscCheckTypeName(B,MATSEQAIJKOKKOS);
579*5f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
580*5f80ce2aSJacob Faibussowitsch   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
581076ba34aSJunchao Zhang 
582*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(A));
583*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(B));
584076ba34aSJunchao Zhang   a    = static_cast<Mat_SeqAIJ*>(A->data);
585076ba34aSJunchao Zhang   b    = static_cast<Mat_SeqAIJ*>(B->data);
586076ba34aSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
587076ba34aSJunchao Zhang   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
588076ba34aSJunchao Zhang   aa   = akok->a_dual.view_device();
589076ba34aSJunchao Zhang   ai   = akok->i_dual.view_device();
590076ba34aSJunchao Zhang   ba   = bkok->a_dual.view_device();
591076ba34aSJunchao Zhang   bi   = bkok->i_dual.view_device();
592076ba34aSJunchao Zhang   m    = A->rmap->n; /* M, N and nnz of C */
593076ba34aSJunchao Zhang   n    = A->cmap->n + B->cmap->n;
594076ba34aSJunchao Zhang   nnz  = a->nz + b->nz;
595076ba34aSJunchao Zhang   aN   = A->cmap->n; /* N of A */
596076ba34aSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
597076ba34aSJunchao Zhang     aj = akok->j_dual.view_device();
598076ba34aSJunchao Zhang     bj = bkok->j_dual.view_device();
599076ba34aSJunchao Zhang     auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0));
600076ba34aSJunchao Zhang     auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0));
601076ba34aSJunchao Zhang     auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0));
602076ba34aSJunchao Zhang     ca = ca_dual.view_device();
603076ba34aSJunchao Zhang     ci = ci_dual.view_device();
604076ba34aSJunchao Zhang     cj = cj_dual.view_device();
605076ba34aSJunchao Zhang 
606076ba34aSJunchao Zhang     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
607076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
608076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
609076ba34aSJunchao Zhang       PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
610076ba34aSJunchao Zhang 
611076ba34aSJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
612076ba34aSJunchao Zhang         ci(i) = coffset;
613076ba34aSJunchao Zhang         if (i == m-1) ci(m) = ai(m) + bi(m);
614076ba34aSJunchao Zhang       });
615076ba34aSJunchao Zhang 
616076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
617076ba34aSJunchao Zhang         if (k < alen) {
618076ba34aSJunchao Zhang           ca(coffset+k) = aa(ai(i)+k);
619076ba34aSJunchao Zhang           cj(coffset+k) = aj(ai(i)+k);
620076ba34aSJunchao Zhang         } else {
621076ba34aSJunchao Zhang           ca(coffset+k) = ba(bi(i)+k-alen);
622076ba34aSJunchao Zhang           cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */
623076ba34aSJunchao Zhang         }
624076ba34aSJunchao Zhang       });
625076ba34aSJunchao Zhang     });
626076ba34aSJunchao Zhang     ca_dual.modify_device();
627076ba34aSJunchao Zhang     ci_dual.modify_device();
628076ba34aSJunchao Zhang     cj_dual.modify_device();
629076ba34aSJunchao Zhang     CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual));
630*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C));
631076ba34aSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
632076ba34aSJunchao Zhang     PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
633076ba34aSJunchao Zhang     PetscCheckTypeName(*C,MATSEQAIJKOKKOS);
634076ba34aSJunchao Zhang     ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr);
635076ba34aSJunchao Zhang     ca   = ckok->a_dual.view_device();
636076ba34aSJunchao Zhang     ci   = ckok->i_dual.view_device();
637076ba34aSJunchao Zhang 
638076ba34aSJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
639076ba34aSJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
640076ba34aSJunchao Zhang       PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i);
641076ba34aSJunchao Zhang       Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) {
642076ba34aSJunchao Zhang         if (k < alen) ca(ci(i)+k) = aa(ai(i)+k);
643076ba34aSJunchao Zhang         else          ca(ci(i)+k) = ba(bi(i)+k-alen);
644076ba34aSJunchao Zhang       });
645076ba34aSJunchao Zhang     });
646*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosModifyDevice(*C));
647076ba34aSJunchao Zhang   }
648076ba34aSJunchao Zhang   PetscFunctionReturn(0);
649076ba34aSJunchao Zhang }
650076ba34aSJunchao Zhang 
651076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata)
652076ba34aSJunchao Zhang {
653076ba34aSJunchao Zhang   PetscFunctionBegin;
654076ba34aSJunchao Zhang   delete static_cast<MatProductData_SeqAIJKokkos*>(pdata);
655a3f881fbSStefano Zampini   PetscFunctionReturn(0);
656a3f881fbSStefano Zampini }
657a3f881fbSStefano Zampini 
658a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
659a3f881fbSStefano Zampini {
660a3f881fbSStefano Zampini   Mat_Product                    *product = C->product;
661a3f881fbSStefano Zampini   Mat                            A,B;
662076ba34aSJunchao Zhang   bool                           transA,transB; /* use bool, since KK needs this type */
663a3f881fbSStefano Zampini   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
664a3f881fbSStefano Zampini   Mat_SeqAIJ                     *c;
665076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
666076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB;
667a3f881fbSStefano Zampini 
668a3f881fbSStefano Zampini   PetscFunctionBegin;
669a3f881fbSStefano Zampini   MatCheckProduct(C,1);
670*5f80ce2aSJacob Faibussowitsch   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
671076ba34aSJunchao Zhang   pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data);
672076ba34aSJunchao Zhang 
673076ba34aSJunchao Zhang   if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */
674076ba34aSJunchao Zhang     pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric  */
675076ba34aSJunchao Zhang     PetscFunctionReturn(0);
676076ba34aSJunchao Zhang   }
677076ba34aSJunchao Zhang 
678076ba34aSJunchao Zhang   switch (product->type) {
679076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
680076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
681076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
682076ba34aSJunchao Zhang     default:
68398921bdaSJacob Faibussowitsch       SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
684076ba34aSJunchao Zhang   }
685076ba34aSJunchao Zhang 
686a3f881fbSStefano Zampini   A     = product->A;
687a3f881fbSStefano Zampini   B     = product->B;
688*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(A));
689*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(B));
690a3f881fbSStefano Zampini   akok  = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
691a3f881fbSStefano Zampini   bkok  = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
692a3f881fbSStefano Zampini   ckok  = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
693076ba34aSJunchao Zhang 
694*5f80ce2aSJacob Faibussowitsch   PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty");
695076ba34aSJunchao Zhang 
696076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
697076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
698076ba34aSJunchao Zhang 
699076ba34aSJunchao Zhang   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
700076ba34aSJunchao Zhang   if (transA) {
701*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
702076ba34aSJunchao Zhang     transA = false;
703a3f881fbSStefano Zampini   }
704a3f881fbSStefano Zampini 
705076ba34aSJunchao Zhang   if (transB) {
706*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
707076ba34aSJunchao Zhang     transB = false;
708076ba34aSJunchao Zhang   }
709*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
710076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat));
711076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */
712*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
713*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosModifyDevice(C));
714a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
715a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
716*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
717*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
718*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
719a3f881fbSStefano Zampini   c->reallocs         = 0;
720076ba34aSJunchao Zhang   C->info.mallocs     = 0;
721a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
722a3f881fbSStefano Zampini   C->assembled        = C->was_assembled = PETSC_TRUE;
723a3f881fbSStefano Zampini   C->num_ass++;
724a3f881fbSStefano Zampini   PetscFunctionReturn(0);
725a3f881fbSStefano Zampini }
726a3f881fbSStefano Zampini 
727a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
728a3f881fbSStefano Zampini {
729076ba34aSJunchao Zhang   Mat_Product                    *product = C->product;
730076ba34aSJunchao Zhang   MatProductType                 ptype;
731076ba34aSJunchao Zhang   Mat                            A,B;
732076ba34aSJunchao Zhang   bool                           transA,transB;
733076ba34aSJunchao Zhang   Mat_SeqAIJKokkos               *akok,*bkok,*ckok;
734076ba34aSJunchao Zhang   MatProductData_SeqAIJKokkos    *pdata;
735076ba34aSJunchao Zhang   MPI_Comm                       comm;
736076ba34aSJunchao Zhang   KokkosCsrMatrix                *csrmatA,*csrmatB,csrmatC;
737a3f881fbSStefano Zampini 
738a3f881fbSStefano Zampini   PetscFunctionBegin;
739a3f881fbSStefano Zampini   MatCheckProduct(C,1);
740*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectGetComm((PetscObject)C,&comm));
741*5f80ce2aSJacob Faibussowitsch   PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty");
742a3f881fbSStefano Zampini   A       = product->A;
743a3f881fbSStefano Zampini   B       = product->B;
744*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(A));
745*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(B));
746a3f881fbSStefano Zampini   akok    = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
747a3f881fbSStefano Zampini   bkok    = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
748076ba34aSJunchao Zhang   csrmatA = &akok->csrmat;
749076ba34aSJunchao Zhang   csrmatB = &bkok->csrmat;
750076ba34aSJunchao Zhang 
751a3f881fbSStefano Zampini   ptype   = product->type;
752a3f881fbSStefano Zampini   switch (ptype) {
753076ba34aSJunchao Zhang     case MATPRODUCT_AB:  transA = false; transB = false; break;
754076ba34aSJunchao Zhang     case MATPRODUCT_AtB: transA = true;  transB = false; break;
755076ba34aSJunchao Zhang     case MATPRODUCT_ABt: transA = false; transB = true;  break;
756a3f881fbSStefano Zampini     default:
75798921bdaSJacob Faibussowitsch       SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
758a3f881fbSStefano Zampini   }
759a3f881fbSStefano Zampini 
760076ba34aSJunchao Zhang   product->data = pdata = new MatProductData_SeqAIJKokkos();
761076ba34aSJunchao Zhang   pdata->kh.set_team_work_size(16);
762076ba34aSJunchao Zhang   pdata->kh.set_dynamic_scheduling(true);
763076ba34aSJunchao Zhang   pdata->reusesym = product->api_user;
764a3f881fbSStefano Zampini 
765076ba34aSJunchao Zhang   /* TODO: add command line options to select spgemm algorithms */
766076ba34aSJunchao Zhang   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
7676ffb9508SJunchao Zhang   /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API.
7686ffb9508SJunchao Zhang      We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support.
7696ffb9508SJunchao Zhang    */
770076ba34aSJunchao Zhang   pdata->kh.create_spgemm_handle(spgemm_alg);
771076ba34aSJunchao Zhang 
772*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
773076ba34aSJunchao Zhang   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
774076ba34aSJunchao Zhang   if (transA) {
775*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA));
776076ba34aSJunchao Zhang     transA = false;
777076ba34aSJunchao Zhang   }
778076ba34aSJunchao Zhang 
779076ba34aSJunchao Zhang   if (transB) {
780*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB));
781076ba34aSJunchao Zhang     transB = false;
782076ba34aSJunchao Zhang   }
783076ba34aSJunchao Zhang 
784076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
785076ba34aSJunchao Zhang   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
786076ba34aSJunchao Zhang     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
787076ba34aSJunchao Zhang     calling new Mat_SeqAIJKokkos().
788076ba34aSJunchao Zhang     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
789076ba34aSJunchao Zhang   */
790076ba34aSJunchao Zhang   CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC));
791076ba34aSJunchao Zhang   CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC));
792*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
793076ba34aSJunchao Zhang 
794076ba34aSJunchao Zhang   CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
795*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok));
796076ba34aSJunchao Zhang   C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
797a3f881fbSStefano Zampini   PetscFunctionReturn(0);
798a3f881fbSStefano Zampini }
799a3f881fbSStefano Zampini 
800a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
801076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
802a3f881fbSStefano Zampini {
803076ba34aSJunchao Zhang   Mat_Product    *product = mat->product;
804a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
805a3f881fbSStefano Zampini 
806a3f881fbSStefano Zampini   PetscFunctionBegin;
807a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
808*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok));
809a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
810*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok));
811a3f881fbSStefano Zampini   }
812a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
813a3f881fbSStefano Zampini     switch (product->type) {
814a3f881fbSStefano Zampini     case MATPRODUCT_AB:
815a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
816a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
817a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
818a3f881fbSStefano Zampini       break;
819a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
820a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
821a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
822a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
823a3f881fbSStefano Zampini       break;
824a3f881fbSStefano Zampini     default:
825a3f881fbSStefano Zampini       break;
826a3f881fbSStefano Zampini     }
827a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
828*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatProductSetFromOptions_SeqAIJ(mat));
829a3f881fbSStefano Zampini   }
830a3f881fbSStefano Zampini   PetscFunctionReturn(0);
831a3f881fbSStefano Zampini }
832a587d139SMark 
833f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
834f0cf5187SStefano Zampini {
835f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
836f0cf5187SStefano Zampini 
837f0cf5187SStefano Zampini   PetscFunctionBegin;
838*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
839*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(A));
840f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
841076ba34aSJunchao Zhang   KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device());
842*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosModifyDevice(A));
843*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
844*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
845f0cf5187SStefano Zampini   PetscFunctionReturn(0);
846f0cf5187SStefano Zampini }
847f0cf5187SStefano Zampini 
848a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
849a587d139SMark {
850076ba34aSJunchao Zhang   Mat_SeqAIJKokkos *aijkok;
851a587d139SMark 
852a587d139SMark   PetscFunctionBegin;
853076ba34aSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
8542328674fSJunchao Zhang   if (aijkok) { /* Only zero the device if data is already there */
855076ba34aSJunchao Zhang     KokkosBlas::fill(aijkok->a_dual.view_device(),0.0);
856*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosModifyDevice(A));
8572328674fSJunchao Zhang   } else { /* Might be preallocated but not assembled */
858*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatZeroEntries_SeqAIJ(A));
8592328674fSJunchao Zhang   }
860a587d139SMark   PetscFunctionReturn(0);
861a587d139SMark }
862a587d139SMark 
863db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
86442550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv)
865db78de30SJunchao Zhang {
866db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
867db78de30SJunchao Zhang 
868db78de30SJunchao Zhang   PetscFunctionBegin;
869db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
870db78de30SJunchao Zhang   PetscValidPointer(kv,2);
871db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
872*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(A));
873db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
874076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
875db78de30SJunchao Zhang   PetscFunctionReturn(0);
876db78de30SJunchao Zhang }
877db78de30SJunchao Zhang 
87842550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv)
879db78de30SJunchao Zhang {
880db78de30SJunchao Zhang   PetscFunctionBegin;
881db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
882db78de30SJunchao Zhang   PetscValidPointer(kv,2);
883db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
884db78de30SJunchao Zhang   PetscFunctionReturn(0);
885db78de30SJunchao Zhang }
886db78de30SJunchao Zhang 
88742550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv)
888db78de30SJunchao Zhang {
889db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
890db78de30SJunchao Zhang 
891db78de30SJunchao Zhang   PetscFunctionBegin;
892db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
893db78de30SJunchao Zhang   PetscValidPointer(kv,2);
894db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
895*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(A));
896db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
897076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
898db78de30SJunchao Zhang   PetscFunctionReturn(0);
899db78de30SJunchao Zhang }
900db78de30SJunchao Zhang 
90142550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv)
902db78de30SJunchao Zhang {
903db78de30SJunchao Zhang   PetscFunctionBegin;
904db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
905db78de30SJunchao Zhang   PetscValidPointer(kv,2);
906db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
907*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosModifyDevice(A));
908db78de30SJunchao Zhang   PetscFunctionReturn(0);
909db78de30SJunchao Zhang }
910db78de30SJunchao Zhang 
91142550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
912db78de30SJunchao Zhang {
913db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
914db78de30SJunchao Zhang 
915db78de30SJunchao Zhang   PetscFunctionBegin;
916db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
917db78de30SJunchao Zhang   PetscValidPointer(kv,2);
918db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
919db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
920076ba34aSJunchao Zhang   *kv    = aijkok->a_dual.view_device();
921db78de30SJunchao Zhang   PetscFunctionReturn(0);
922db78de30SJunchao Zhang }
923db78de30SJunchao Zhang 
92442550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv)
925db78de30SJunchao Zhang {
926db78de30SJunchao Zhang   PetscFunctionBegin;
927db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
928db78de30SJunchao Zhang   PetscValidPointer(kv,2);
929db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
930*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosModifyDevice(A));
931db78de30SJunchao Zhang   PetscFunctionReturn(0);
932db78de30SJunchao Zhang }
933db78de30SJunchao Zhang 
934c17cf699SJunchao Zhang /* Computes Y += alpha X */
935c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)
936a587d139SMark {
937a587d139SMark   Mat_SeqAIJ                 *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
938c17cf699SJunchao Zhang   Mat_SeqAIJKokkos           *xkok,*ykok,*zkok;
939c17cf699SJunchao Zhang   ConstMatScalarKokkosView   Xa;
940c17cf699SJunchao Zhang   MatScalarKokkosView        Ya;
941a587d139SMark 
942a587d139SMark   PetscFunctionBegin;
943c17cf699SJunchao Zhang   PetscCheckTypeName(Y,MATSEQAIJKOKKOS);
944c17cf699SJunchao Zhang   PetscCheckTypeName(X,MATSEQAIJKOKKOS);
945*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(Y));
946*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(X));
947*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
948db78de30SJunchao Zhang 
949c17cf699SJunchao Zhang   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
950c17cf699SJunchao Zhang     /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */
951a587d139SMark     PetscBool e;
952*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e));
953a587d139SMark     if (e) {
954*5f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscArraycmp(x->j,y->j,y->nz,&e));
955c17cf699SJunchao Zhang       if (e) pattern = SAME_NONZERO_PATTERN;
956a587d139SMark     }
957a587d139SMark   }
958db78de30SJunchao Zhang 
959c17cf699SJunchao Zhang   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
960c17cf699SJunchao Zhang     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
961c17cf699SJunchao Zhang     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
962c17cf699SJunchao Zhang     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
963c17cf699SJunchao Zhang   */
964c17cf699SJunchao Zhang   ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
965c17cf699SJunchao Zhang   xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
966c17cf699SJunchao Zhang   Xa   = xkok->a_dual.view_device();
967c17cf699SJunchao Zhang   Ya   = ykok->a_dual.view_device();
968c17cf699SJunchao Zhang 
969c17cf699SJunchao Zhang   if (pattern == SAME_NONZERO_PATTERN) {
970c17cf699SJunchao Zhang     KokkosBlas::axpy(alpha,Xa,Ya);
971*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosModifyDevice(Y));
972c17cf699SJunchao Zhang   } else if (pattern == SUBSET_NONZERO_PATTERN) {
973c17cf699SJunchao Zhang     MatRowMapKokkosView  Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device();
974c17cf699SJunchao Zhang     MatColIdxKokkosView  Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device();
975c17cf699SJunchao Zhang 
976c17cf699SJunchao Zhang     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) {
977c17cf699SJunchao Zhang       PetscInt i = t.league_rank(); /* row i */
978c17cf699SJunchao Zhang       Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */
979c17cf699SJunchao Zhang         PetscInt p,q = Yi(i);
980c17cf699SJunchao Zhang         for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */
981c17cf699SJunchao Zhang           while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */
982c17cf699SJunchao Zhang           if (Xj(p) == Yj(q)) { /* Found it */
983c17cf699SJunchao Zhang             Ya(q) += alpha * Xa(p);
984c17cf699SJunchao Zhang             q++;
985a587d139SMark           } else {
986c17cf699SJunchao Zhang             /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
987c17cf699SJunchao Zhang                Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
988c17cf699SJunchao Zhang             */
989c17cf699SJunchao Zhang             if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */
990a587d139SMark           }
991c17cf699SJunchao Zhang         }
992c17cf699SJunchao Zhang       });
993c17cf699SJunchao Zhang     });
994*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJKokkosModifyDevice(Y));
995c17cf699SJunchao Zhang   } else { /* different nonzero patterns */
996c17cf699SJunchao Zhang     Mat             Z;
997c17cf699SJunchao Zhang     KokkosCsrMatrix zcsr;
998c17cf699SJunchao Zhang     KernelHandle    kh;
999c17cf699SJunchao Zhang     kh.create_spadd_handle(false);
1000c17cf699SJunchao Zhang     KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr);
1001c17cf699SJunchao Zhang     KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr);
1002c17cf699SJunchao Zhang     zkok = new Mat_SeqAIJKokkos(zcsr);
1003*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z));
1004*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatHeaderReplace(Y,&Z));
1005c17cf699SJunchao Zhang     kh.destroy_spadd_handle();
1006c17cf699SJunchao Zhang   }
1007*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
1008*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */
1009a587d139SMark   PetscFunctionReturn(0);
1010a587d139SMark }
1011a587d139SMark 
1012394ed5ebSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
101342550becSJunchao Zhang {
101442550becSJunchao Zhang   Mat_SeqAIJKokkos *akok;
101542550becSJunchao Zhang   Mat_SeqAIJ       *aseq;
101642550becSJunchao Zhang 
101742550becSJunchao Zhang   PetscFunctionBegin;
1018*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
1019394ed5ebSJunchao Zhang   aseq = static_cast<Mat_SeqAIJ*>(mat->data);
102042550becSJunchao Zhang   akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr);
1021cbc6b225SStefano Zampini   delete akok;
1022cbc6b225SStefano Zampini   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE);
1023*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatZeroEntries_SeqAIJKokkos(mat));
1024394ed5ebSJunchao Zhang   akok->SetUpCOO(aseq);
102542550becSJunchao Zhang   PetscFunctionReturn(0);
102642550becSJunchao Zhang }
102742550becSJunchao Zhang 
102842550becSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)
102942550becSJunchao Zhang {
103042550becSJunchao Zhang   Mat_SeqAIJ                  *aseq = static_cast<Mat_SeqAIJ*>(A->data);
103142550becSJunchao Zhang   Mat_SeqAIJKokkos            *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1032394ed5ebSJunchao Zhang   PetscCount                  Annz = aseq->nz;
1033394ed5ebSJunchao Zhang   const PetscCountKokkosView& jmap = akok->jmap_d;
1034394ed5ebSJunchao Zhang   const PetscCountKokkosView& perm = akok->perm_d;
103542550becSJunchao Zhang   MatScalarKokkosView         Aa;
103642550becSJunchao Zhang   ConstMatScalarKokkosView    kv;
103742550becSJunchao Zhang   PetscMemType                memtype;
103842550becSJunchao Zhang 
103942550becSJunchao Zhang   PetscFunctionBegin;
1040*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscGetMemType(v,&memtype));
104142550becSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1042394ed5ebSJunchao Zhang     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n));
104342550becSJunchao Zhang   } else {
1044394ed5ebSJunchao Zhang     kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */
104542550becSJunchao Zhang   }
104642550becSJunchao Zhang 
1047cbc6b225SStefano Zampini   if (imode == INSERT_VALUES) {
1048*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */
1049cbc6b225SStefano Zampini     Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */
1050*5f80ce2aSJacob Faibussowitsch   } else CHKERRQ(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */
105142550becSJunchao Zhang 
1052cbc6b225SStefano Zampini   Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) {for (PetscCount k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k));});
1053394ed5ebSJunchao Zhang 
1054*5f80ce2aSJacob Faibussowitsch   if (imode == INSERT_VALUES) CHKERRQ(MatSeqAIJRestoreKokkosViewWrite(A,&Aa));
1055*5f80ce2aSJacob Faibussowitsch   else CHKERRQ(MatSeqAIJRestoreKokkosView(A,&Aa));
105642550becSJunchao Zhang   PetscFunctionReturn(0);
105742550becSJunchao Zhang }
105842550becSJunchao Zhang 
105986a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
10608f7e8f9dSMark Adams {
10618f7e8f9dSMark Adams   PetscFunctionBegin;
1062*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncHost(A));
1063*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatLUFactorNumeric_SeqAIJ(B,A,info));
10648f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
10658f7e8f9dSMark Adams   PetscFunctionReturn(0);
10668f7e8f9dSMark Adams }
10678f7e8f9dSMark Adams 
10688c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
10698c3ff71bSJunchao Zhang {
1070076ba34aSJunchao Zhang   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1071076ba34aSJunchao Zhang 
10728c3ff71bSJunchao Zhang   PetscFunctionBegin;
1073076ba34aSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
10746f3d89d0SStefano Zampini   A->boundtocpu  = PETSC_FALSE;
10756f3d89d0SStefano Zampini 
10768c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
10778c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
10788c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1079a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1080f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
1081a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1082076ba34aSJunchao Zhang   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
10838c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
10848c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
10858c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
10868c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
10878c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
10888c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1089076ba34aSJunchao Zhang   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
10900ecb592aSJunchao Zhang   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1091152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1092076ba34aSJunchao Zhang   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1093076ba34aSJunchao Zhang   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1094076ba34aSJunchao Zhang   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1095076ba34aSJunchao Zhang   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1096076ba34aSJunchao Zhang   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1097076ba34aSJunchao Zhang   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
109842550becSJunchao Zhang 
1099*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos));
1100*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos));
1101076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1102076ba34aSJunchao Zhang }
1103076ba34aSJunchao Zhang 
1104076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok)
1105076ba34aSJunchao Zhang {
1106076ba34aSJunchao Zhang   Mat_SeqAIJ *aseq;
1107076ba34aSJunchao Zhang   PetscInt    i,m,n;
1108076ba34aSJunchao Zhang 
1109076ba34aSJunchao Zhang   PetscFunctionBegin;
1110*5f80ce2aSJacob Faibussowitsch   PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty");
1111076ba34aSJunchao Zhang 
1112076ba34aSJunchao Zhang   m = akok->nrows();
1113076ba34aSJunchao Zhang   n = akok->ncols();
1114*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetSizes(A,m,n,m,n));
1115*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetType(A,MATSEQAIJKOKKOS));
1116076ba34aSJunchao Zhang 
1117076ba34aSJunchao Zhang   /* Set up data structures of A as a MATSEQAIJ */
1118*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL));
1119076ba34aSJunchao Zhang   aseq = (Mat_SeqAIJ*)(A)->data;
1120076ba34aSJunchao Zhang 
1121076ba34aSJunchao Zhang   akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */
1122076ba34aSJunchao Zhang   akok->j_dual.sync_host();
1123076ba34aSJunchao Zhang 
1124076ba34aSJunchao Zhang   aseq->i            = akok->i_host_data();
1125076ba34aSJunchao Zhang   aseq->j            = akok->j_host_data();
1126076ba34aSJunchao Zhang   aseq->a            = akok->a_host_data();
1127076ba34aSJunchao Zhang   aseq->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1128076ba34aSJunchao Zhang   aseq->singlemalloc = PETSC_FALSE;
1129076ba34aSJunchao Zhang   aseq->free_a       = PETSC_FALSE;
1130076ba34aSJunchao Zhang   aseq->free_ij      = PETSC_FALSE;
1131076ba34aSJunchao Zhang   aseq->nz           = akok->nnz();
1132076ba34aSJunchao Zhang   aseq->maxnz        = aseq->nz;
1133076ba34aSJunchao Zhang 
1134*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(m,&aseq->imax));
1135*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(m,&aseq->ilen));
1136076ba34aSJunchao Zhang   for (i=0; i<m; i++) {
1137076ba34aSJunchao Zhang     aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i];
1138076ba34aSJunchao Zhang   }
1139076ba34aSJunchao Zhang 
1140076ba34aSJunchao Zhang   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1141076ba34aSJunchao Zhang   akok->nonzerostate = A->nonzerostate;
1142ff751488SJunchao Zhang   A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1143*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
1144*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
1145076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1146076ba34aSJunchao Zhang }
1147076ba34aSJunchao Zhang 
1148076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1149076ba34aSJunchao Zhang 
1150076ba34aSJunchao Zhang    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1151076ba34aSJunchao Zhang  */
1152076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode  MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A)
1153076ba34aSJunchao Zhang {
1154076ba34aSJunchao Zhang   PetscFunctionBegin;
1155*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCreate(comm,A));
1156*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok));
11578c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
11588c3ff71bSJunchao Zhang }
11598c3ff71bSJunchao Zhang 
11608c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
1161152b3e56SJunchao Zhang /*@C
11628c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
11638c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
11648c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
11658c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
11668c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
11678c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
11688c3ff71bSJunchao Zhang 
11698c3ff71bSJunchao Zhang    Collective
11708c3ff71bSJunchao Zhang 
11718c3ff71bSJunchao Zhang    Input Parameters:
11728c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
11738c3ff71bSJunchao Zhang .  m - number of rows
11748c3ff71bSJunchao Zhang .  n - number of columns
11758c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
11768c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
11778c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
11788c3ff71bSJunchao Zhang 
11798c3ff71bSJunchao Zhang    Output Parameter:
11808c3ff71bSJunchao Zhang .  A - the matrix
11818c3ff71bSJunchao Zhang 
11828c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
11838c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
11848c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
11858c3ff71bSJunchao Zhang 
11868c3ff71bSJunchao Zhang    Notes:
11878c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
11888c3ff71bSJunchao Zhang 
11898c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
11908c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
11918c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
11928c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
11938c3ff71bSJunchao Zhang 
11948c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
11958c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
11968c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
11978c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
11988c3ff71bSJunchao Zhang 
11998c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
12008c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
12018c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
12028c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
12038c3ff71bSJunchao Zhang 
12048c3ff71bSJunchao Zhang    Level: intermediate
12058c3ff71bSJunchao Zhang 
1206076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
12078c3ff71bSJunchao Zhang @*/
12088c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
12098c3ff71bSJunchao Zhang {
12108c3ff71bSJunchao Zhang   PetscFunctionBegin;
1211*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscKokkosInitializeCheck());
1212*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCreate(comm,A));
1213*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetSizes(*A,m,n,m,n));
1214*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetType(*A,MATSEQAIJKOKKOS));
1215*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
12168c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
12178c3ff71bSJunchao Zhang }
1218930e68a5SMark Adams 
12198f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
12208f7e8f9dSMark Adams //
122146804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used).
12228f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
12238f7e8f9dSMark Adams //
12248f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
1225930e68a5SMark Adams {
12268f7e8f9dSMark Adams   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
12278f7e8f9dSMark Adams   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
12288f7e8f9dSMark Adams   IS                 isrow = b->row,isicol = b->icol;
12298f7e8f9dSMark Adams   const PetscInt     *r_h,*ic_h;
1230300d22a6SJunchao Zhang   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data();
1231076ba34aSJunchao Zhang   const PetscScalar  *aa_d = aijkok->a_dual.view_device().data();
1232076ba34aSJunchao Zhang   PetscScalar        *ba_d = baijkok->a_dual.view_device().data();
12338f7e8f9dSMark Adams   PetscBool          row_identity,col_identity;
123446804e07SMark Adams   PetscInt           nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used
1235930e68a5SMark Adams 
1236930e68a5SMark Adams   PetscFunctionBegin;
12372c71b3e2SJacob Faibussowitsch   PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n);
1238*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity));
12392c71b3e2SJacob Faibussowitsch   PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
1240*5f80ce2aSJacob Faibussowitsch   CHKERRQ(ISGetIndices(isrow,&r_h));
1241*5f80ce2aSJacob Faibussowitsch   CHKERRQ(ISGetIndices(isicol,&ic_h));
1242*5f80ce2aSJacob Faibussowitsch   CHKERRQ(ISGetSize(isicol,&nc));
1243*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
1244*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(A));
12458f7e8f9dSMark Adams   {
12468f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
12478f7e8f9dSMark Adams     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
12488f7e8f9dSMark Adams     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
12498f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
12508f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
12518f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
12528f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
12538f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
12548f7e8f9dSMark Adams     size_t flops_h = 0.0;
12558f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
12568f7e8f9dSMark Adams     Kokkos::View<size_t> d_flops_k ("flops");
12578f7e8f9dSMark Adams     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
12588f7e8f9dSMark Adams     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
12598f7e8f9dSMark Adams     Kokkos::deep_copy (d_flops_k, h_flops_k);
12608f7e8f9dSMark Adams     Kokkos::deep_copy (d_r_k, h_r_k);
12618f7e8f9dSMark Adams     Kokkos::deep_copy (d_ic_k, h_ic_k);
12628f7e8f9dSMark Adams     // Fill A --> fact
12638f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
1264042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA
12658f7e8f9dSMark Adams         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
12668f7e8f9dSMark Adams         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
12678f7e8f9dSMark Adams         // zero rows of B
12688f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
12698f7e8f9dSMark Adams             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
12708f7e8f9dSMark Adams             PetscScalar *baL = ba_d + bi_d[rowb];
12718f7e8f9dSMark Adams             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
12728f7e8f9dSMark Adams             /* zero (unfactored row) */
12738f7e8f9dSMark Adams             for (int j=0;j<nzbL;j++) baL[j] = 0;
12748f7e8f9dSMark Adams             for (int j=0;j<nzbU;j++) baU[j] = 0;
12758f7e8f9dSMark Adams           });
12768f7e8f9dSMark Adams         // copy A into B
12778f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
12788f7e8f9dSMark Adams             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
12798f7e8f9dSMark Adams             const PetscScalar *av    = aa_d + ai_d[rowa];
12808f7e8f9dSMark Adams             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
12818f7e8f9dSMark Adams             /* load in initial (unfactored row) */
12828f7e8f9dSMark Adams             for (int j=0;j<nza;j++) {
12838f7e8f9dSMark Adams               PetscInt    colb = ic[ajtmp[j]];
12848f7e8f9dSMark Adams               PetscScalar vala = av[j];
12858f7e8f9dSMark Adams               if (colb == rowb) {
12868f7e8f9dSMark Adams                 *(ba_d + bdiag_d[rowb]) = vala;
12878f7e8f9dSMark Adams               } else {
12888f7e8f9dSMark Adams                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
12898f7e8f9dSMark Adams                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
12908f7e8f9dSMark Adams                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
12918f7e8f9dSMark Adams                 for (int j=0; j<nz ; j++) {
12928f7e8f9dSMark Adams                   if (pbj[j] == colb) {
12938f7e8f9dSMark Adams                     pba[j] = vala;
12948f7e8f9dSMark Adams                     set++;
12958f7e8f9dSMark Adams                     break;
12968f7e8f9dSMark Adams                   }
12978f7e8f9dSMark Adams                 }
12988f1da0b2SJunchao Zhang                #if !defined(PETSC_HAVE_SYCL)
12998f7e8f9dSMark Adams                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
13008f1da0b2SJunchao Zhang                #endif
13018f7e8f9dSMark Adams               }
13028f7e8f9dSMark Adams             }
13038f7e8f9dSMark Adams           });
13048f7e8f9dSMark Adams       });
13058f7e8f9dSMark Adams     Kokkos::fence();
1306930e68a5SMark Adams 
13078f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
13088f7e8f9dSMark Adams         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
13098f7e8f9dSMark Adams         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
13108f7e8f9dSMark Adams         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
1311042217e8SBarry Smith         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA
13128f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc;
13138f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
13148f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
13158f7e8f9dSMark Adams         for (int ii=start; ii<end-1; ii++) {
13168f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
13178f7e8f9dSMark Adams           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
13188f7e8f9dSMark Adams           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
13198f7e8f9dSMark Adams           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
13208f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
13218f7e8f9dSMark Adams               PetscInt kIdx = j*Ni + field_block_idx;
13228f7e8f9dSMark Adams               if (kIdx >= nzUi) /* void */ ;
13238f7e8f9dSMark Adams               else {
13248f7e8f9dSMark Adams                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
13258f7e8f9dSMark Adams                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
13268f7e8f9dSMark Adams                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
13278f7e8f9dSMark Adams                 size_t         st_idx;
13288f7e8f9dSMark Adams                 // find and do L(k,i) = A(:k,i) / A(i,i)
13298f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
13308f7e8f9dSMark Adams                 // get column, there has got to be a better way
13318f7e8f9dSMark Adams                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
13328f7e8f9dSMark Adams                     if (pjL[j] == ii) {
13338f7e8f9dSMark Adams                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
13348f7e8f9dSMark Adams                       idx = j; // output
13358f7e8f9dSMark Adams                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
13368f7e8f9dSMark Adams                     }
13378f7e8f9dSMark Adams                 }, st_idx);
13388f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
13398f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
134099551766SMark Adams                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register
134199551766SMark Adams #endif
134299551766SMark Adams                 // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
13438f7e8f9dSMark Adams                 // U(i+1,:end)
13448f7e8f9dSMark Adams                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
13458f7e8f9dSMark Adams                       PetscScalar Uij = baUi[uiIdx];
13468f7e8f9dSMark Adams                       PetscInt    col = bjUi[uiIdx];
13478f7e8f9dSMark Adams                       if (col==myk) {
13488f7e8f9dSMark Adams                         // A_kk = A_kk - L_ki * U_ij(k)
13498f7e8f9dSMark Adams                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
13508f7e8f9dSMark Adams                         *Akkv = *Akkv - L_ki() * Uij; // UiK
13518f7e8f9dSMark Adams                       } else {
13528f7e8f9dSMark Adams                         PetscScalar    *start, *end, *pAkjv=NULL;
13538f7e8f9dSMark Adams                         PetscInt       high, low;
13548f7e8f9dSMark Adams                         const PetscInt *startj;
13558f7e8f9dSMark Adams                         if (col<myk) { // L
13568f7e8f9dSMark Adams                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
13578f7e8f9dSMark Adams                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
13588f7e8f9dSMark Adams                           start = pLki+1; // start at pLki+1, A22(myk,1)
13598f7e8f9dSMark Adams                           startj= bj_d + bi_d[myk] + idx;
13608f7e8f9dSMark Adams                           end   = ba_d + bi_d[myk+1];
13618f7e8f9dSMark Adams                         } else {
13628f7e8f9dSMark Adams                           PetscInt idx = bdiag_d[myk+1]+1;
13638f7e8f9dSMark Adams                           start = ba_d + idx;
13648f7e8f9dSMark Adams                           startj= bj_d + idx;
13658f7e8f9dSMark Adams                           end   = ba_d + bdiag_d[myk];
13668f7e8f9dSMark Adams                         }
13678f7e8f9dSMark Adams                         // search for 'col', use bisection search - TODO
13688f7e8f9dSMark Adams                         low  = 0;
13698f7e8f9dSMark Adams                         high = (PetscInt)(end-start);
13708f7e8f9dSMark Adams                         while (high-low > 5) {
13718f7e8f9dSMark Adams                           int t = (low+high)/2;
13728f7e8f9dSMark Adams                           if (startj[t] > col) high = t;
13738f7e8f9dSMark Adams                           else                 low  = t;
13748f7e8f9dSMark Adams                         }
13758f7e8f9dSMark Adams                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
13768f7e8f9dSMark Adams                           if (startj[pAkjv-start] == col) break;
13778f7e8f9dSMark Adams                         }
13788f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL)
137999551766SMark Adams                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register
138099551766SMark Adams #endif
13818f7e8f9dSMark Adams                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
13828f7e8f9dSMark Adams                       }
13838f7e8f9dSMark Adams                     });
13848f7e8f9dSMark Adams               }
13858f7e8f9dSMark Adams             });
13868f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
13878f7e8f9dSMark Adams           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
13888f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
13898f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
13908f7e8f9dSMark Adams       });
13918f7e8f9dSMark Adams     Kokkos::fence();
13928f7e8f9dSMark Adams     Kokkos::deep_copy (h_flops_k, d_flops_k);
1393*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLogGpuFlops((PetscLogDouble)h_flops_k()));
13948f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
13958f7e8f9dSMark Adams         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
13968f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
13978f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
13988f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
13998f7e8f9dSMark Adams             int i = start + outer_index*Ni + lg_rank%Ni;
14008f7e8f9dSMark Adams             if (i < end) {
14018f7e8f9dSMark Adams               PetscScalar *pv = ba_d + bdiag_d[i];
14028f7e8f9dSMark Adams               *pv = 1.0/(*pv);
14038f7e8f9dSMark Adams             }
14048f7e8f9dSMark Adams           });
14058f7e8f9dSMark Adams       });
14068f7e8f9dSMark Adams   }
1407*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
1408*5f80ce2aSJacob Faibussowitsch   CHKERRQ(ISRestoreIndices(isicol,&ic_h));
1409*5f80ce2aSJacob Faibussowitsch   CHKERRQ(ISRestoreIndices(isrow,&r_h));
14108f7e8f9dSMark Adams 
1411*5f80ce2aSJacob Faibussowitsch   CHKERRQ(ISIdentity(isrow,&row_identity));
1412*5f80ce2aSJacob Faibussowitsch   CHKERRQ(ISIdentity(isicol,&col_identity));
14138f7e8f9dSMark Adams   if (b->inode.size) {
14148f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
14158f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
14168f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
14178f7e8f9dSMark Adams   } else {
14188f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
14198f7e8f9dSMark Adams   }
14208f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
1421*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncHost(B)); // solve on CPU
14228f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
14238f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
14248f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
14258f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
14268f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
14278f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
14288f7e8f9dSMark Adams 
1429930e68a5SMark Adams   PetscFunctionReturn(0);
1430930e68a5SMark Adams }
1431930e68a5SMark Adams 
143286a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1433930e68a5SMark Adams {
1434930e68a5SMark Adams   PetscFunctionBegin;
1435*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
143686a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
143786a27549SJunchao Zhang   PetscFunctionReturn(0);
143886a27549SJunchao Zhang }
143986a27549SJunchao Zhang 
144086a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
144186a27549SJunchao Zhang {
144286a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
144386a27549SJunchao Zhang 
144486a27549SJunchao Zhang   PetscFunctionBegin;
144586a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
144686a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
144786a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
144886a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
144986a27549SJunchao Zhang   }
145086a27549SJunchao Zhang   PetscFunctionReturn(0);
145186a27549SJunchao Zhang }
145286a27549SJunchao Zhang 
145386a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
145486a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
145586a27549SJunchao Zhang {
145686a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1457076ba34aSJunchao Zhang   MatColIdxType             n = A->rmap->n;
145886a27549SJunchao Zhang 
145986a27549SJunchao Zhang   PetscFunctionBegin;
146086a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
146186a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
1462076ba34aSJunchao Zhang     factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1);
146386a27549SJunchao Zhang     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1464076ba34aSJunchao Zhang     factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1465076ba34aSJunchao Zhang     factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0));
146686a27549SJunchao Zhang 
146786a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1468076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1469076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1470076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
147186a27549SJunchao Zhang         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
147286a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
147386a27549SJunchao Zhang 
147486a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
147586a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
147686a27549SJunchao Zhang     */
1477076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1478076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
147986a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
148086a27549SJunchao Zhang 
148186a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
148286a27549SJunchao Zhang 
148386a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
1484076ba34aSJunchao Zhang     factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1);
148586a27549SJunchao Zhang     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1486076ba34aSJunchao Zhang     factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1487076ba34aSJunchao Zhang     factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0));
148886a27549SJunchao Zhang 
148986a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1490076ba34aSJunchao Zhang       ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView,
1491076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView,
1492076ba34aSJunchao Zhang       MatRowMapKokkosView,DefaultExecutionSpace>(
149386a27549SJunchao Zhang         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
149486a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
149586a27549SJunchao Zhang 
149686a27549SJunchao Zhang     /* Sort indices. See comments above */
1497076ba34aSJunchao Zhang     KokkosKernels::sort_crs_matrix<DefaultExecutionSpace,
1498076ba34aSJunchao Zhang       MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>(
149986a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
150086a27549SJunchao Zhang 
150186a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
150286a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
150386a27549SJunchao Zhang   }
150486a27549SJunchao Zhang   PetscFunctionReturn(0);
150586a27549SJunchao Zhang }
150686a27549SJunchao Zhang 
150786a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
150886a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
150986a27549SJunchao Zhang {
151086a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
151186a27549SJunchao Zhang   PetscScalarKokkosView          xv;
151286a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
151386a27549SJunchao Zhang 
151486a27549SJunchao Zhang   PetscFunctionBegin;
1515*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
1516*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSymbolicSolveCheck(A));
1517*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetKokkosView(b,&bv));
1518*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetKokkosViewWrite(x,&xv));
151986a27549SJunchao Zhang   /* Solve L tmpv = b */
15203f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector));
152186a27549SJunchao Zhang   /* Solve Ux = tmpv */
15223f520e80SJunchao Zhang   CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv));
1523*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreKokkosView(b,&bv));
1524*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreKokkosViewWrite(x,&xv));
1525*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
152686a27549SJunchao Zhang   PetscFunctionReturn(0);
152786a27549SJunchao Zhang }
152886a27549SJunchao Zhang 
1529076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */
153086a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
153186a27549SJunchao Zhang {
153286a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
153386a27549SJunchao Zhang   PetscScalarKokkosView          xv;
153486a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
153586a27549SJunchao Zhang 
153686a27549SJunchao Zhang   PetscFunctionBegin;
1537*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
1538*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosTransposeSolveCheck(A));
1539*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetKokkosView(b,&bv));
1540*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetKokkosViewWrite(x,&xv));
154186a27549SJunchao Zhang   /* Solve U^T tmpv = b */
154286a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
154386a27549SJunchao Zhang 
154486a27549SJunchao Zhang   /* Solve L^T x = tmpv */
154586a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1546*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreKokkosView(b,&bv));
1547*5f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreKokkosViewWrite(x,&xv));
1548*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
154986a27549SJunchao Zhang   PetscFunctionReturn(0);
155086a27549SJunchao Zhang }
155186a27549SJunchao Zhang 
155286a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
155386a27549SJunchao Zhang {
155486a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
155586a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
155686a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
155786a27549SJunchao Zhang 
155886a27549SJunchao Zhang   PetscFunctionBegin;
1559*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
1560*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(A));
1561076ba34aSJunchao Zhang 
1562076ba34aSJunchao Zhang   auto a_d = aijkok->a_dual.view_device();
1563076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1564076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1565076ba34aSJunchao Zhang 
1566076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
156786a27549SJunchao Zhang 
156886a27549SJunchao Zhang   B->assembled                       = PETSC_TRUE;
156986a27549SJunchao Zhang   B->preallocated                    = PETSC_TRUE;
157086a27549SJunchao Zhang   B->ops->solve                      = MatSolve_SeqAIJKokkos;
157186a27549SJunchao Zhang   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
157286a27549SJunchao Zhang   B->ops->matsolve                   = NULL;
157386a27549SJunchao Zhang   B->ops->matsolvetranspose          = NULL;
157486a27549SJunchao Zhang   B->offloadmask                     = PETSC_OFFLOAD_GPU;
157586a27549SJunchao Zhang 
157686a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
157786a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
157886a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1579eeadb341SJunchao Zhang   /* TODO: log flops, but how to know that? */
1580*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
158186a27549SJunchao Zhang   PetscFunctionReturn(0);
158286a27549SJunchao Zhang }
158386a27549SJunchao Zhang 
158486a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
158586a27549SJunchao Zhang {
158686a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok;
158786a27549SJunchao Zhang   Mat_SeqAIJ                     *b;
158886a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
158986a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
159086a27549SJunchao Zhang   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
159186a27549SJunchao Zhang   PetscInt                       n = A->rmap->n;
159286a27549SJunchao Zhang 
159386a27549SJunchao Zhang   PetscFunctionBegin;
1594*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(A));
159586a27549SJunchao Zhang   /* Rebuild factors */
159686a27549SJunchao Zhang   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
159786a27549SJunchao Zhang   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
159886a27549SJunchao Zhang 
159986a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
160086a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
160186a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
160286a27549SJunchao Zhang 
160386a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
160486a27549SJunchao Zhang 
160586a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
160686a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
160786a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d,n+1);
160886a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
160986a27549SJunchao Zhang 
161086a27549SJunchao Zhang   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1611076ba34aSJunchao Zhang   auto i_d = aijkok->i_dual.view_device();
1612076ba34aSJunchao Zhang   auto j_d = aijkok->j_dual.view_device();
1613076ba34aSJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
161486a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
161586a27549SJunchao Zhang 
161686a27549SJunchao Zhang   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
161786a27549SJunchao Zhang   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
161886a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
161986a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
162086a27549SJunchao Zhang 
162186a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
162286a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
162386a27549SJunchao Zhang  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
162486a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
162586a27549SJunchao Zhang  #else
162686a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
162786a27549SJunchao Zhang  #endif
162886a27549SJunchao Zhang 
162986a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
163086a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
163186a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
163286a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
163386a27549SJunchao Zhang 
163486a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
1635*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL));
163686a27549SJunchao Zhang   b     = (Mat_SeqAIJ*)B->data;
163786a27549SJunchao Zhang   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
163886a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
163986a27549SJunchao Zhang   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
164086a27549SJunchao Zhang 
164186a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
164286a27549SJunchao Zhang   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1643930e68a5SMark Adams   PetscFunctionReturn(0);
1644930e68a5SMark Adams }
1645930e68a5SMark Adams 
16468f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
16478f7e8f9dSMark Adams {
16488f7e8f9dSMark Adams   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
16498f7e8f9dSMark Adams   const PetscInt   nrows   = A->rmap->n;
1650930e68a5SMark Adams 
16518f7e8f9dSMark Adams   PetscFunctionBegin;
1652*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
16538f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
16548f7e8f9dSMark Adams   // move B data into Kokkos
1655*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(B)); // create aijkok
1656*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJKokkosSyncDevice(A)); // create aijkok
16578f7e8f9dSMark Adams   {
16588f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
1659300d22a6SJunchao Zhang     if (!baijkok->diag_d.extent(0)) {
16608f7e8f9dSMark Adams       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1661300d22a6SJunchao Zhang       baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
1662300d22a6SJunchao Zhang       Kokkos::deep_copy (baijkok->diag_d, h_diag);
16638f7e8f9dSMark Adams     }
16648f7e8f9dSMark Adams   }
16658f7e8f9dSMark Adams   PetscFunctionReturn(0);
16668f7e8f9dSMark Adams }
16678f7e8f9dSMark Adams 
166886a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1669930e68a5SMark Adams {
1670930e68a5SMark Adams   PetscFunctionBegin;
1671930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
1672930e68a5SMark Adams   PetscFunctionReturn(0);
1673930e68a5SMark Adams }
1674930e68a5SMark Adams 
16758f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
16768f7e8f9dSMark Adams {
16778f7e8f9dSMark Adams   PetscFunctionBegin;
16788f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
16798f7e8f9dSMark Adams   PetscFunctionReturn(0);
16808f7e8f9dSMark Adams }
16818f7e8f9dSMark Adams 
1682930e68a5SMark Adams /*MC
168386a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
168486a27549SJunchao Zhang   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1685930e68a5SMark Adams 
1686930e68a5SMark Adams   Level: beginner
1687930e68a5SMark Adams 
16883f3ba80aSJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1689930e68a5SMark Adams M*/
169086a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1691930e68a5SMark Adams {
1692930e68a5SMark Adams   PetscInt       n = A->rmap->n;
1693930e68a5SMark Adams 
1694930e68a5SMark Adams   PetscFunctionBegin;
1695*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCreate(PetscObjectComm((PetscObject)A),B));
1696*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetSizes(*B,n,n,n,n));
1697930e68a5SMark Adams   (*B)->factortype = ftype;
1698*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1699*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetType(*B,MATSEQAIJKOKKOS));
1700930e68a5SMark Adams 
17018f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
1702*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetBlockSizesFromMats(*B,A,A));
170386a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_TRUE;
170486a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
170586a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
1706*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetBlockSizesFromMats(*B,A,A));
170786a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
170886a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
170998921bdaSJacob Faibussowitsch   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1710930e68a5SMark Adams 
1711*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1712*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos));
1713930e68a5SMark Adams   PetscFunctionReturn(0);
1714930e68a5SMark Adams }
17158f7e8f9dSMark Adams 
17168f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
17178f7e8f9dSMark Adams {
17188f7e8f9dSMark Adams   PetscInt       n = A->rmap->n;
17198f7e8f9dSMark Adams 
17208f7e8f9dSMark Adams   PetscFunctionBegin;
1721*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCreate(PetscObjectComm((PetscObject)A),B));
1722*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetSizes(*B,n,n,n,n));
17238f7e8f9dSMark Adams   (*B)->factortype = ftype;
1724f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
1725*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1726*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetType(*B,MATSEQAIJKOKKOS));
17278f7e8f9dSMark Adams 
17288f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
1729*5f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetBlockSizesFromMats(*B,A,A));
17308f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
17318f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
17328f7e8f9dSMark Adams 
1733*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1734*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device));
17358f7e8f9dSMark Adams   PetscFunctionReturn(0);
17368f7e8f9dSMark Adams }
173786a27549SJunchao Zhang 
173886a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
173986a27549SJunchao Zhang {
174086a27549SJunchao Zhang   PetscFunctionBegin;
1741*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos));
1742*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos));
1743*5f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device));
174486a27549SJunchao Zhang   PetscFunctionReturn(0);
174586a27549SJunchao Zhang }
174686a27549SJunchao Zhang 
1747076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */
1748076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
1749076ba34aSJunchao Zhang {
1750076ba34aSJunchao Zhang   const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
1751076ba34aSJunchao Zhang   const auto&       jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries);
1752076ba34aSJunchao Zhang   const auto&       av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values);
1753076ba34aSJunchao Zhang   const PetscInt    *i = iv.data();
1754076ba34aSJunchao Zhang   const PetscInt    *j = jv.data();
1755076ba34aSJunchao Zhang   const PetscScalar *a = av.data();
1756076ba34aSJunchao Zhang   PetscInt          m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz();
1757076ba34aSJunchao Zhang 
1758076ba34aSJunchao Zhang   PetscFunctionBegin;
1759*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz));
1760076ba34aSJunchao Zhang   for (PetscInt k=0; k<m; k++) {
1761*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k));
1762076ba34aSJunchao Zhang     for (PetscInt p=i[k]; p<i[k+1]; p++) {
1763*5f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]));
1764076ba34aSJunchao Zhang     }
1765*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscPrintf(PETSC_COMM_SELF,"\n"));
1766076ba34aSJunchao Zhang   }
1767076ba34aSJunchao Zhang   PetscFunctionReturn(0);
1768076ba34aSJunchao Zhang }
1769