xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision db78de30f30343e34deea67338c2133a9cb3c5fc)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
38c3ff71bSJunchao Zhang #include <petscsystypes.h>
48c3ff71bSJunchao Zhang #include <petscerror.h>
58c3ff71bSJunchao Zhang 
68c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
7f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
88c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
98c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
1086a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
1186a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
1286a27549SJunchao Zhang 
138c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h>
148c3ff71bSJunchao Zhang 
158c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
16a587d139SMark #include <petscmat.h>
178c3ff71bSJunchao Zhang 
188c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
198c3ff71bSJunchao Zhang 
208c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
218c3ff71bSJunchao Zhang {
228c3ff71bSJunchao Zhang   PetscErrorCode    ierr;
23a587d139SMark   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
248c3ff71bSJunchao Zhang 
258c3ff71bSJunchao Zhang   PetscFunctionBegin;
268c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
2786a27549SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
28a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
29a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
30a587d139SMark   }
318c3ff71bSJunchao Zhang   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
328c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
338c3ff71bSJunchao Zhang }
348c3ff71bSJunchao Zhang 
3586a27549SJunchao Zhang /* Sync CSR data to device if not yet */
368c3ff71bSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
378c3ff71bSJunchao Zhang {
38152b3e56SJunchao Zhang   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ*>(A->data);
398c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
408c3ff71bSJunchao Zhang   PetscInt                  nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
418c3ff71bSJunchao Zhang 
428c3ff71bSJunchao Zhang   PetscFunctionBegin;
4386a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
44152b3e56SJunchao Zhang   /* If aijkok is not built yet OR the nonzero pattern on CPU has changed, build aijkok from the scratch */
458c3ff71bSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
468c3ff71bSJunchao Zhang     delete aijkok;
478c3ff71bSJunchao Zhang     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
488c3ff71bSJunchao Zhang     aijkok->nonzerostate = A->nonzerostate;
498c3ff71bSJunchao Zhang     A->spptr             = aijkok;
508c3ff71bSJunchao Zhang   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
5186a27549SJunchao Zhang     aijkok->a_dual.clear_sync_state();
52152b3e56SJunchao Zhang     aijkok->a_dual.modify_host(); /* Mark host is modified */
53152b3e56SJunchao Zhang     aijkok->a_dual.sync_device(); /* Sync the device */
5486a27549SJunchao Zhang     aijkok->transpose_updated = PETSC_FALSE;
5586a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
568c3ff71bSJunchao Zhang   }
578c3ff71bSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
588c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
598c3ff71bSJunchao Zhang }
608c3ff71bSJunchao Zhang 
6186a27549SJunchao Zhang /* Mark the CSR data on device is modified */
6286a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSetDeviceModified(Mat A)
6386a27549SJunchao Zhang {
6486a27549SJunchao Zhang   PetscErrorCode   ierr;
6586a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
6686a27549SJunchao Zhang 
6786a27549SJunchao Zhang   PetscFunctionBegin;
6886a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
6986a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
7086a27549SJunchao Zhang   aijkok->a_dual.modify_device();
7186a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
7286a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
7386a27549SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_GPU;
7486a27549SJunchao Zhang   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
7586a27549SJunchao Zhang   PetscFunctionReturn(0);
7686a27549SJunchao Zhang }
7786a27549SJunchao Zhang 
78f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
79f0cf5187SStefano Zampini {
80f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
81f0cf5187SStefano Zampini 
82f0cf5187SStefano Zampini   PetscFunctionBegin;
83f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
8486a27549SJunchao Zhang    /* We do not expect one needs factors on host  */
8586a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
86f0cf5187SStefano Zampini   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
87f0cf5187SStefano Zampini     if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
8886a27549SJunchao Zhang     aijkok->a_dual.clear_sync_state();
8986a27549SJunchao Zhang     aijkok->a_dual.modify_device(); /* Mark device is modified */
9086a27549SJunchao Zhang     aijkok->a_dual.sync_host(); /* Sync the host */
91f0cf5187SStefano Zampini     A->offloadmask = PETSC_OFFLOAD_BOTH;
92f0cf5187SStefano Zampini   }
93f0cf5187SStefano Zampini   PetscFunctionReturn(0);
94f0cf5187SStefano Zampini }
95f0cf5187SStefano Zampini 
96f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
97f0cf5187SStefano Zampini {
98f0cf5187SStefano Zampini   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
99f0cf5187SStefano Zampini   PetscErrorCode ierr;
100f0cf5187SStefano Zampini 
101f0cf5187SStefano Zampini   PetscFunctionBegin;
102f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
103f0cf5187SStefano Zampini   *array = a->a;
104f0cf5187SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
105f0cf5187SStefano Zampini   PetscFunctionReturn(0);
106f0cf5187SStefano Zampini }
107f0cf5187SStefano Zampini 
108a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
109a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat)
110a587d139SMark {
111a587d139SMark   Mat_SeqAIJKokkos *aijkok;
112a587d139SMark   Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat);
113a587d139SMark 
114a587d139SMark   PetscFunctionBegin;
115a587d139SMark   // ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
116a587d139SMark   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
117a587d139SMark   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos");
118152b3e56SJunchao Zhang   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
119a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
120a587d139SMark   PetscFunctionReturn(0);
121a587d139SMark }
122a587d139SMark 
123a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
124a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat)
125a587d139SMark {
126a587d139SMark   Mat_SeqAIJKokkos *aijkok;
127a587d139SMark 
128a587d139SMark   PetscFunctionBegin;
129a587d139SMark   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
130a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
131a587d139SMark     *d_mat = aijkok->device_mat_d.data();
132a587d139SMark   } else {
133a587d139SMark     PetscErrorCode   ierr;
134a587d139SMark     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
135a587d139SMark     *d_mat  = NULL;
136a587d139SMark   }
137a587d139SMark   PetscFunctionReturn(0);
138a587d139SMark }
139152b3e56SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose(Mat A)
140152b3e56SJunchao Zhang {
141152b3e56SJunchao Zhang   PetscErrorCode                   ierr;
142152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
143152b3e56SJunchao Zhang 
144152b3e56SJunchao Zhang   PetscFunctionBegin;
145152b3e56SJunchao Zhang   if (!aijkok->At) { /* Generate At for the first time */
146152b3e56SJunchao Zhang     ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->At);CHKERRQ(ierr);
147152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr);
14886a27549SJunchao Zhang   } else if (!aijkok->transpose_updated) { /* Only update At values */
149152b3e56SJunchao Zhang     ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->At);CHKERRQ(ierr);
150152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr);
151152b3e56SJunchao Zhang   }
15286a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_TRUE;
153152b3e56SJunchao Zhang   PetscFunctionReturn(0);
154152b3e56SJunchao Zhang }
155152b3e56SJunchao Zhang 
15686a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian(Mat A)
157152b3e56SJunchao Zhang {
158152b3e56SJunchao Zhang   PetscErrorCode                   ierr;
159152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
160152b3e56SJunchao Zhang 
161152b3e56SJunchao Zhang   PetscFunctionBegin;
162152b3e56SJunchao Zhang   if (!aijkok->Ah) { /* Generate Ah for the first time */
163152b3e56SJunchao Zhang     ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->Ah);CHKERRQ(ierr);
164152b3e56SJunchao Zhang     ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr);
165152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr);
16686a27549SJunchao Zhang   } else if (!aijkok->hermitian_updated) { /* Only update Ah values */
167152b3e56SJunchao Zhang     ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->Ah);CHKERRQ(ierr);
168152b3e56SJunchao Zhang     ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr);
169152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr);
170152b3e56SJunchao Zhang   }
17186a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_TRUE;
172152b3e56SJunchao Zhang   PetscFunctionReturn(0);
173152b3e56SJunchao Zhang }
174a587d139SMark 
1758c3ff71bSJunchao Zhang /* y = A x */
1768c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1778c3ff71bSJunchao Zhang {
1788c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1798c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
180152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
181152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
1828c3ff71bSJunchao Zhang 
1838c3ff71bSJunchao Zhang   PetscFunctionBegin;
1848c3ff71bSJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
185152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
186152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
1878c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
188152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
189152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
190152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
191bb2d6e60SJunchao Zhang   ierr   = WaitForKokkos();CHKERRQ(ierr);
192152b3e56SJunchao Zhang   /* 2.0*aijkok->csrmat.nnz()-aijkok->csrmat.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */
193152b3e56SJunchao Zhang   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
1948c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1958c3ff71bSJunchao Zhang }
1968c3ff71bSJunchao Zhang 
1978c3ff71bSJunchao Zhang /* y = A^T x */
1988c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1998c3ff71bSJunchao Zhang {
2008c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2018c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
202152b3e56SJunchao Zhang   Mat                              B;
203152b3e56SJunchao Zhang   const char                       *mode;
204152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
205152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2068c3ff71bSJunchao Zhang 
2078c3ff71bSJunchao Zhang   PetscFunctionBegin;
2088c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
209152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
210152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
211152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
212152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr);
213152b3e56SJunchao Zhang     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At;
214152b3e56SJunchao Zhang     mode = "N";
215152b3e56SJunchao Zhang   } else {
216152b3e56SJunchao Zhang     B    = A;
217152b3e56SJunchao Zhang     mode = "T";
218152b3e56SJunchao Zhang   }
219152b3e56SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
220152b3e56SJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
221152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
222152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
223bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
224152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2258c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2268c3ff71bSJunchao Zhang }
2278c3ff71bSJunchao Zhang 
2288c3ff71bSJunchao Zhang /* y = A^H x */
2298c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2308c3ff71bSJunchao Zhang {
2318c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2328c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
233152b3e56SJunchao Zhang   Mat                              B;
234152b3e56SJunchao Zhang   const char                       *mode;
235152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
236152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2378c3ff71bSJunchao Zhang 
2388c3ff71bSJunchao Zhang   PetscFunctionBegin;
2398c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
240152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
241152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
242152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
24386a27549SJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr);
244152b3e56SJunchao Zhang     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah;
245152b3e56SJunchao Zhang     mode = "N";
246152b3e56SJunchao Zhang   } else {
247152b3e56SJunchao Zhang     B    = A;
248152b3e56SJunchao Zhang     mode = "C";
249152b3e56SJunchao Zhang   }
250152b3e56SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
251152b3e56SJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
252152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
253152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
254bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
255152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2568c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2578c3ff71bSJunchao Zhang }
2588c3ff71bSJunchao Zhang 
2598c3ff71bSJunchao Zhang /* z = A x + y */
2608c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
2618c3ff71bSJunchao Zhang {
2628c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2638c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
264152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
265152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
2668c3ff71bSJunchao Zhang 
2678c3ff71bSJunchao Zhang   PetscFunctionBegin;
2688c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
269152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
270152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
271152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
2728c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
2738c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
274152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
275152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
276152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
277152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
278bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
279152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2808c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2818c3ff71bSJunchao Zhang }
2828c3ff71bSJunchao Zhang 
2838c3ff71bSJunchao Zhang /* z = A^T x + y */
2848c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
2858c3ff71bSJunchao Zhang {
2868c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2878c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
288152b3e56SJunchao Zhang   Mat                              B;
289152b3e56SJunchao Zhang   const char                       *mode;
290152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
291152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
2928c3ff71bSJunchao Zhang 
2938c3ff71bSJunchao Zhang   PetscFunctionBegin;
2948c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
295152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
296152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
297152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
2988c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
299152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
300152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr);
301152b3e56SJunchao Zhang     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At;
302152b3e56SJunchao Zhang     mode = "N";
303152b3e56SJunchao Zhang   } else {
304152b3e56SJunchao Zhang     B    = A;
305152b3e56SJunchao Zhang     mode = "T";
306152b3e56SJunchao Zhang   }
307152b3e56SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
308152b3e56SJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
309152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
310152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
311152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
312bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
313152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
3148c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3158c3ff71bSJunchao Zhang }
3168c3ff71bSJunchao Zhang 
3178c3ff71bSJunchao Zhang /* z = A^H x + y */
3188c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3198c3ff71bSJunchao Zhang {
3208c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3218c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
322152b3e56SJunchao Zhang   Mat                              B;
323152b3e56SJunchao Zhang   const char                       *mode;
324152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
325152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
3268c3ff71bSJunchao Zhang 
3278c3ff71bSJunchao Zhang   PetscFunctionBegin;
3288c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
329152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
330152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
331152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3328c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
333152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
33486a27549SJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr);
335152b3e56SJunchao Zhang     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah;
336152b3e56SJunchao Zhang     mode = "N";
337152b3e56SJunchao Zhang   } else {
338152b3e56SJunchao Zhang     B    = A;
339152b3e56SJunchao Zhang     mode = "C";
340152b3e56SJunchao Zhang   }
341152b3e56SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
342152b3e56SJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
343152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
344152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
345152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
346bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
347152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
348152b3e56SJunchao Zhang   PetscFunctionReturn(0);
349152b3e56SJunchao Zhang }
350152b3e56SJunchao Zhang 
351152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
352152b3e56SJunchao Zhang {
353152b3e56SJunchao Zhang   PetscErrorCode            ierr;
354152b3e56SJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
355152b3e56SJunchao Zhang 
356152b3e56SJunchao Zhang   PetscFunctionBegin;
357152b3e56SJunchao Zhang   switch (op) {
358152b3e56SJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
359152b3e56SJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
360152b3e56SJunchao Zhang       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
361152b3e56SJunchao Zhang       A->form_explicit_transpose = flg;
362152b3e56SJunchao Zhang       break;
363152b3e56SJunchao Zhang     default:
364152b3e56SJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
365152b3e56SJunchao Zhang       break;
366152b3e56SJunchao Zhang   }
3678c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3688c3ff71bSJunchao Zhang }
3698c3ff71bSJunchao Zhang 
3703d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
3718c3ff71bSJunchao Zhang {
3728c3ff71bSJunchao Zhang   PetscErrorCode ierr;
3738c3ff71bSJunchao Zhang   Mat            B;
374c58ef05eSStefano Zampini   Mat_SeqAIJ     *aij;
3758c3ff71bSJunchao Zhang 
3768c3ff71bSJunchao Zhang   PetscFunctionBegin;
377a3f881fbSStefano Zampini   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
3788c3ff71bSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
3798c3ff71bSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
3808c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
3818c3ff71bSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
3828c3ff71bSJunchao Zhang   }
3838c3ff71bSJunchao Zhang 
3848c3ff71bSJunchao Zhang   B    = *newmat;
3858c3ff71bSJunchao Zhang   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
38686a27549SJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
38786a27549SJunchao Zhang 
3888c3ff71bSJunchao Zhang   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
3890d8bce8aSStefano Zampini   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
390f0cf5187SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr);
391c58ef05eSStefano Zampini   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
392c58ef05eSStefano Zampini   aij  = (Mat_SeqAIJ*)B->data;
393c58ef05eSStefano Zampini   aij->inode.use = PETSC_FALSE;
3948c3ff71bSJunchao Zhang 
3958c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3968c3ff71bSJunchao Zhang }
3978c3ff71bSJunchao Zhang 
3988c3ff71bSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
3998c3ff71bSJunchao Zhang {
4008c3ff71bSJunchao Zhang   PetscErrorCode ierr;
4018c3ff71bSJunchao Zhang 
4028c3ff71bSJunchao Zhang   PetscFunctionBegin;
4038c3ff71bSJunchao Zhang   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
4048c3ff71bSJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
4058c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4068c3ff71bSJunchao Zhang }
4078c3ff71bSJunchao Zhang 
4088c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
4098c3ff71bSJunchao Zhang {
4108c3ff71bSJunchao Zhang   PetscErrorCode             ierr;
41186a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
4128c3ff71bSJunchao Zhang 
4138c3ff71bSJunchao Zhang   PetscFunctionBegin;
41486a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
41586a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
4168f7e8f9dSMark Adams     if (aijkok) {
4178f7e8f9dSMark Adams       if (aijkok->device_mat_d.data()) {
418a587d139SMark         delete aijkok->colmap_d;
419a587d139SMark         delete aijkok->i_uncompressed_d;
420a587d139SMark       }
4218f7e8f9dSMark Adams       if (aijkok->diag_d) delete aijkok->diag_d;
4228f7e8f9dSMark Adams     }
4238c3ff71bSJunchao Zhang     delete aijkok;
42486a27549SJunchao Zhang   } else {
42586a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
42686a27549SJunchao Zhang   }
427f0cf5187SStefano Zampini   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr);
428152b3e56SJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
429152b3e56SJunchao Zhang   A->spptr = NULL;
4308c3ff71bSJunchao Zhang   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
4318c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4328c3ff71bSJunchao Zhang }
4338c3ff71bSJunchao Zhang 
43486a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
43586a27549SJunchao Zhang {
43686a27549SJunchao Zhang   PetscErrorCode ierr;
43786a27549SJunchao Zhang 
43886a27549SJunchao Zhang   PetscFunctionBegin;
43986a27549SJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
44086a27549SJunchao Zhang   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
44186a27549SJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
44286a27549SJunchao Zhang   PetscFunctionReturn(0);
44386a27549SJunchao Zhang }
44486a27549SJunchao Zhang 
445a3f881fbSStefano Zampini #if 0
446a3f881fbSStefano Zampini static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data)
447a3f881fbSStefano Zampini {
448a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data);
449a3f881fbSStefano Zampini 
450a3f881fbSStefano Zampini   PetscFunctionBegin;
451a3f881fbSStefano Zampini   delete kh;
452a3f881fbSStefano Zampini   PetscFunctionReturn(0);
453a3f881fbSStefano Zampini }
454a3f881fbSStefano Zampini 
455a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
456a3f881fbSStefano Zampini {
457a3f881fbSStefano Zampini   Mat_Product          *product = C->product;
458a3f881fbSStefano Zampini   Mat                  A,B;
459a3f881fbSStefano Zampini   MatProductType       ptype;
460a3f881fbSStefano Zampini   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
461a3f881fbSStefano Zampini   bool                 tA,tB;
462a3f881fbSStefano Zampini   PetscErrorCode       ierr;
463a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh;
464a3f881fbSStefano Zampini   Mat_SeqAIJ           *c;
465a3f881fbSStefano Zampini 
466a3f881fbSStefano Zampini   PetscFunctionBegin;
467a3f881fbSStefano Zampini   MatCheckProduct(C,1);
468a3f881fbSStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
469a3f881fbSStefano Zampini   A = product->A;
470a3f881fbSStefano Zampini   B = product->B;
471a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
472a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
473a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
474a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
475a3f881fbSStefano Zampini   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
476a3f881fbSStefano Zampini   kh   = static_cast<MatMatKernelHandle_t*>(C->product->data);
477a3f881fbSStefano Zampini   ptype = product->type;
478a3f881fbSStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
479a3f881fbSStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
480a3f881fbSStefano Zampini   switch (ptype) {
481a3f881fbSStefano Zampini   case MATPRODUCT_AB:
482a3f881fbSStefano Zampini     tA = false;
483a3f881fbSStefano Zampini     tB = false;
484a3f881fbSStefano Zampini     break;
485a3f881fbSStefano Zampini   case MATPRODUCT_AtB:
486a3f881fbSStefano Zampini     tA = true;
487a3f881fbSStefano Zampini     tB = false;
488a3f881fbSStefano Zampini     break;
489a3f881fbSStefano Zampini   case MATPRODUCT_ABt:
490a3f881fbSStefano Zampini     tA = false;
491a3f881fbSStefano Zampini     tB = true;
492a3f881fbSStefano Zampini     break;
493a3f881fbSStefano Zampini   default:
494a3f881fbSStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
495a3f881fbSStefano Zampini   }
496a3f881fbSStefano Zampini 
497a3f881fbSStefano Zampini   KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr);
498a3f881fbSStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
499a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
500a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
501a3f881fbSStefano Zampini   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
502a3f881fbSStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
503a3f881fbSStefano Zampini   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
504a3f881fbSStefano Zampini   c->reallocs         = 0;
505a3f881fbSStefano Zampini   C->info.mallocs    += 0;
506a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
507a3f881fbSStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
508a3f881fbSStefano Zampini   C->num_ass++;
509a3f881fbSStefano Zampini   /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */
510a3f881fbSStefano Zampini   // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray()
511a3f881fbSStefano Zampini   C->offloadmask = PETSC_OFFLOAD_BOTH;
512a3f881fbSStefano Zampini   // Also, we should add support to copy back from device to host
513a3f881fbSStefano Zampini   PetscFunctionReturn(0);
514a3f881fbSStefano Zampini }
515a3f881fbSStefano Zampini 
516a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
517a3f881fbSStefano Zampini {
518a3f881fbSStefano Zampini   Mat_Product          *product = C->product;
519a3f881fbSStefano Zampini   Mat                  A,B;
520a3f881fbSStefano Zampini   MatProductType       ptype;
521a3f881fbSStefano Zampini   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
522a3f881fbSStefano Zampini   PetscInt             m,n,k;
523a3f881fbSStefano Zampini   bool                 tA,tB;
524a3f881fbSStefano Zampini   PetscErrorCode       ierr;
525a3f881fbSStefano Zampini   Mat_SeqAIJ           *c;
526a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh;
527a3f881fbSStefano Zampini 
528a3f881fbSStefano Zampini   PetscFunctionBegin;
529a3f881fbSStefano Zampini   MatCheckProduct(C,1);
530a3f881fbSStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
531a3f881fbSStefano Zampini   A = product->A;
532a3f881fbSStefano Zampini   B = product->B;
533a3f881fbSStefano Zampini   // TODO only copy the i,j data, not the values
534a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
535a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
536a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
537a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
538a3f881fbSStefano Zampini   ptype = product->type;
539a3f881fbSStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
540a3f881fbSStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
541a3f881fbSStefano Zampini   switch (ptype) {
542a3f881fbSStefano Zampini   case MATPRODUCT_AB:
543a3f881fbSStefano Zampini     tA = false;
544a3f881fbSStefano Zampini     tB = false;
545a3f881fbSStefano Zampini     m = A->rmap->n;
546a3f881fbSStefano Zampini     n = B->cmap->n;
547a3f881fbSStefano Zampini     break;
548a3f881fbSStefano Zampini   case MATPRODUCT_AtB:
549a3f881fbSStefano Zampini     tA = true;
550a3f881fbSStefano Zampini     tB = false;
551a3f881fbSStefano Zampini     m = A->cmap->n;
552a3f881fbSStefano Zampini     n = B->cmap->n;
553a3f881fbSStefano Zampini     break;
554a3f881fbSStefano Zampini   case MATPRODUCT_ABt:
555a3f881fbSStefano Zampini     tA = false;
556a3f881fbSStefano Zampini     tB = true;
557a3f881fbSStefano Zampini     m = A->rmap->n;
558a3f881fbSStefano Zampini     n = B->rmap->n;
559a3f881fbSStefano Zampini     break;
560a3f881fbSStefano Zampini   default:
561a3f881fbSStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
562a3f881fbSStefano Zampini   }
563a3f881fbSStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
564a3f881fbSStefano Zampini   ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr);
565a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
566a3f881fbSStefano Zampini 
567a3f881fbSStefano Zampini   kh = new MatMatKernelHandle_t;
568a3f881fbSStefano Zampini   // TODO SZ: ADD RUNTIME SELECTION OF THESE
569a3f881fbSStefano Zampini   kh->set_team_work_size(16);
570a3f881fbSStefano Zampini   kh->set_dynamic_scheduling(true);
571a3f881fbSStefano Zampini   // Select an spgemm algorithm, limited by configuration at compile-time and
572a3f881fbSStefano Zampini   // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED,
573a3f881fbSStefano Zampini   // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL}
574a3f881fbSStefano Zampini   std::string myalg("SPGEMM_KK_MEMORY");
575a3f881fbSStefano Zampini   kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg));
576a3f881fbSStefano Zampini 
577a3f881fbSStefano Zampini   /////////////////////////////////////
578a3f881fbSStefano Zampini   // TODO JZ
579a3f881fbSStefano Zampini   ckok = NULL; //new Mat_SeqAIJKokkos();
580a3f881fbSStefano Zampini   C->spptr = ckok;
581a3f881fbSStefano Zampini   KokkosCsrMatrix_t ccsr; // here only to have the code compile
582a3f881fbSStefano Zampini   KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr);
583a3f881fbSStefano Zampini   //cerr = WaitForKOKKOS();CHKERRCUDA(cerr);
584a3f881fbSStefano Zampini   //c->nz = get_nnz_from_ccsr
585a3f881fbSStefano Zampini   //////////////////////////////////////
586a3f881fbSStefano Zampini   c->singlemalloc = PETSC_FALSE;
587a3f881fbSStefano Zampini   c->free_a       = PETSC_TRUE;
588a3f881fbSStefano Zampini   c->free_ij      = PETSC_TRUE;
589a3f881fbSStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
590a3f881fbSStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
591a3f881fbSStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
592a3f881fbSStefano Zampini   ////////////////////////////////////
593a3f881fbSStefano Zampini   // TODO JZ copy from device to c->i and c->j
594a3f881fbSStefano Zampini   ////////////////////////////////////
595a3f881fbSStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
596a3f881fbSStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
597a3f881fbSStefano Zampini   c->maxnz = c->nz;
598a3f881fbSStefano Zampini   c->nonzerorowcnt = 0;
599a3f881fbSStefano Zampini   c->rmax = 0;
600a3f881fbSStefano Zampini   for (k = 0; k < m; k++) {
601a3f881fbSStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
602a3f881fbSStefano Zampini     c->ilen[k] = c->imax[k] = nn;
603a3f881fbSStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
604a3f881fbSStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
605a3f881fbSStefano Zampini   }
606a3f881fbSStefano Zampini 
607a3f881fbSStefano Zampini   C->nonzerostate++;
608a3f881fbSStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
609a3f881fbSStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
610a3f881fbSStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
611a3f881fbSStefano Zampini   ckok->nonzerostate = C->nonzerostate;
612a3f881fbSStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
613a3f881fbSStefano Zampini   C->preallocated  = PETSC_TRUE;
614a3f881fbSStefano Zampini   C->assembled     = PETSC_FALSE;
615a3f881fbSStefano Zampini   C->was_assembled = PETSC_FALSE;
616a3f881fbSStefano Zampini 
617a3f881fbSStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
618a3f881fbSStefano Zampini   C->product->data = kh;
619a3f881fbSStefano Zampini   C->product->destroy = MatMatKernelHandleDestroy_Private;
620a3f881fbSStefano Zampini   PetscFunctionReturn(0);
621a3f881fbSStefano Zampini }
622a3f881fbSStefano Zampini 
623a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
624a3f881fbSStefano Zampini PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
625a3f881fbSStefano Zampini {
626a3f881fbSStefano Zampini   Mat_Product    *product = mat->product;
627a3f881fbSStefano Zampini   PetscErrorCode ierr;
628a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
629a3f881fbSStefano Zampini 
630a3f881fbSStefano Zampini   PetscFunctionBegin;
631a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
632a3f881fbSStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
633a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
634a3f881fbSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
635a3f881fbSStefano Zampini   }
636a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
637a3f881fbSStefano Zampini     switch (product->type) {
638a3f881fbSStefano Zampini     case MATPRODUCT_AB:
639a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
640a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
641a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
642a3f881fbSStefano Zampini       break;
643a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
644a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
645a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
646a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
647a3f881fbSStefano Zampini       break;
648a3f881fbSStefano Zampini     default:
649a3f881fbSStefano Zampini       break;
650a3f881fbSStefano Zampini     }
651a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
652a3f881fbSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
653a3f881fbSStefano Zampini   }
654a3f881fbSStefano Zampini   PetscFunctionReturn(0);
655a3f881fbSStefano Zampini }
656a3f881fbSStefano Zampini #endif
657a3f881fbSStefano Zampini 
658a587d139SMark static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
659a587d139SMark {
660a587d139SMark   PetscErrorCode    ierr;
661a587d139SMark   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
662a587d139SMark   PetscFunctionBegin;
66386a27549SJunchao Zhang   if (aijkok && aijkok->device_mat_d.data()) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported");
664a587d139SMark   ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr);
665a587d139SMark   PetscFunctionReturn(0);
666a587d139SMark }
667a587d139SMark 
668f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
669f0cf5187SStefano Zampini {
670f0cf5187SStefano Zampini   PetscErrorCode   ierr;
671f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
672f0cf5187SStefano Zampini 
673f0cf5187SStefano Zampini   PetscFunctionBegin;
674f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
675f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
676f0cf5187SStefano Zampini   KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d);
677f0cf5187SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
678f0cf5187SStefano Zampini   ierr = WaitForKokkos();CHKERRQ(ierr);
679f0cf5187SStefano Zampini   ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr);
680f0cf5187SStefano Zampini   // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
681f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
682f0cf5187SStefano Zampini   PetscFunctionReturn(0);
683f0cf5187SStefano Zampini }
684f0cf5187SStefano Zampini 
685a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
686a587d139SMark {
687a587d139SMark   PetscErrorCode   ierr;
688a587d139SMark   PetscBool        both = PETSC_FALSE;
689a587d139SMark   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
690a587d139SMark   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
691a587d139SMark 
692a587d139SMark   PetscFunctionBegin;
693a587d139SMark   if (aijkok && aijkok->a_d.data()) {
694f0cf5187SStefano Zampini     KokkosBlas::fill(aijkok->a_d,0.0);
695a587d139SMark     both = PETSC_TRUE;
696a587d139SMark   }
697a587d139SMark   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
698a587d139SMark   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
699a587d139SMark   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
700a587d139SMark   else A->offloadmask = PETSC_OFFLOAD_CPU;
701a587d139SMark   PetscFunctionReturn(0);
702a587d139SMark }
703a587d139SMark 
704*db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
705*db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
706*db78de30SJunchao Zhang {
707*db78de30SJunchao Zhang   PetscErrorCode     ierr;
708*db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
709*db78de30SJunchao Zhang 
710*db78de30SJunchao Zhang   PetscFunctionBegin;
711*db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
712*db78de30SJunchao Zhang   PetscValidPointer(kv,2);
713*db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
714*db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
715*db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
716*db78de30SJunchao Zhang   *kv    = aijkok->a_d;
717*db78de30SJunchao Zhang   PetscFunctionReturn(0);
718*db78de30SJunchao Zhang }
719*db78de30SJunchao Zhang 
720*db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv)
721*db78de30SJunchao Zhang {
722*db78de30SJunchao Zhang   PetscFunctionBegin;
723*db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
724*db78de30SJunchao Zhang   PetscValidPointer(kv,2);
725*db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
726*db78de30SJunchao Zhang   PetscFunctionReturn(0);
727*db78de30SJunchao Zhang }
728*db78de30SJunchao Zhang 
729*db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv)
730*db78de30SJunchao Zhang {
731*db78de30SJunchao Zhang   PetscErrorCode     ierr;
732*db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
733*db78de30SJunchao Zhang 
734*db78de30SJunchao Zhang   PetscFunctionBegin;
735*db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
736*db78de30SJunchao Zhang   PetscValidPointer(kv,2);
737*db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
738*db78de30SJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
739*db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
740*db78de30SJunchao Zhang   *kv    = aijkok->a_d;
741*db78de30SJunchao Zhang   PetscFunctionReturn(0);
742*db78de30SJunchao Zhang }
743*db78de30SJunchao Zhang 
744*db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv)
745*db78de30SJunchao Zhang {
746*db78de30SJunchao Zhang   PetscErrorCode     ierr;
747*db78de30SJunchao Zhang 
748*db78de30SJunchao Zhang   PetscFunctionBegin;
749*db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
750*db78de30SJunchao Zhang   PetscValidPointer(kv,2);
751*db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
752*db78de30SJunchao Zhang   ierr = MatSeqAIJKokkosSetDeviceModified(A);CHKERRQ(ierr);
753*db78de30SJunchao Zhang   PetscFunctionReturn(0);
754*db78de30SJunchao Zhang }
755*db78de30SJunchao Zhang 
756*db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
757*db78de30SJunchao Zhang {
758*db78de30SJunchao Zhang   Mat_SeqAIJKokkos   *aijkok;
759*db78de30SJunchao Zhang 
760*db78de30SJunchao Zhang   PetscFunctionBegin;
761*db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
762*db78de30SJunchao Zhang   PetscValidPointer(kv,2);
763*db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
764*db78de30SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
765*db78de30SJunchao Zhang   *kv    = aijkok->a_d;
766*db78de30SJunchao Zhang   PetscFunctionReturn(0);
767*db78de30SJunchao Zhang }
768*db78de30SJunchao Zhang 
769*db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv)
770*db78de30SJunchao Zhang {
771*db78de30SJunchao Zhang   PetscErrorCode     ierr;
772*db78de30SJunchao Zhang 
773*db78de30SJunchao Zhang   PetscFunctionBegin;
774*db78de30SJunchao Zhang   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
775*db78de30SJunchao Zhang   PetscValidPointer(kv,2);
776*db78de30SJunchao Zhang   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
777*db78de30SJunchao Zhang   ierr = MatSeqAIJKokkosSetDeviceModified(A);CHKERRQ(ierr);
778*db78de30SJunchao Zhang   PetscFunctionReturn(0);
779*db78de30SJunchao Zhang }
780*db78de30SJunchao Zhang 
781*db78de30SJunchao Zhang /* Computes Y += a*X */
782f0cf5187SStefano Zampini static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str)
783a587d139SMark {
784a587d139SMark   PetscErrorCode ierr;
785a587d139SMark   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
786a587d139SMark 
787a587d139SMark   PetscFunctionBegin;
788f0cf5187SStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
789a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
790a587d139SMark     PetscFunctionReturn(0);
791a587d139SMark   }
792*db78de30SJunchao Zhang 
793f0cf5187SStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) {
794a587d139SMark     PetscBool e;
795a587d139SMark     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
796a587d139SMark     if (e) {
797a587d139SMark       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
798f0cf5187SStefano Zampini       if (e) str = SAME_NONZERO_PATTERN;
799a587d139SMark     }
800a587d139SMark   }
801*db78de30SJunchao Zhang 
802a587d139SMark   if (str != SAME_NONZERO_PATTERN) {
803*db78de30SJunchao Zhang     /* TODO: do MatAXPY on device */
804a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
805a587d139SMark     PetscFunctionReturn(0);
806a587d139SMark   } else {
807*db78de30SJunchao Zhang     ConstPetscScalarKokkosView xv;
808*db78de30SJunchao Zhang     PetscScalarKokkosView      yv;
809*db78de30SJunchao Zhang 
810*db78de30SJunchao Zhang     ierr = MatSeqAIJGetKokkosView(X,&xv);CHKERRQ(ierr);
811*db78de30SJunchao Zhang     ierr = MatSeqAIJGetKokkosView(Y,&yv);CHKERRQ(ierr);
812*db78de30SJunchao Zhang     KokkosBlas::axpy(a,xv,yv);
813*db78de30SJunchao Zhang     ierr = MatSeqAIJRestoreKokkosView(X,&xv);CHKERRQ(ierr);
814*db78de30SJunchao Zhang     ierr = MatSeqAIJRestoreKokkosView(Y,&yv);CHKERRQ(ierr);
815a587d139SMark   }
816a587d139SMark   PetscFunctionReturn(0);
817a587d139SMark }
818a587d139SMark 
81986a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
8208f7e8f9dSMark Adams {
8218f7e8f9dSMark Adams   PetscErrorCode ierr;
8228f7e8f9dSMark Adams 
8238f7e8f9dSMark Adams   PetscFunctionBegin;
8248f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
8258f7e8f9dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
8268f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
8278f7e8f9dSMark Adams   PetscFunctionReturn(0);
8288f7e8f9dSMark Adams }
8298f7e8f9dSMark Adams 
8308c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
8318c3ff71bSJunchao Zhang {
8328c3ff71bSJunchao Zhang   PetscFunctionBegin;
8336f3d89d0SStefano Zampini   A->boundtocpu = PETSC_FALSE;
8346f3d89d0SStefano Zampini 
835a587d139SMark   A->ops->setvalues                 = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */
8368c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
8378c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
8388c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
839a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
840f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
841a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
842a3f881fbSStefano Zampini   //A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
8438c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
8448c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
8458c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
8468c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
8478c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
8488c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
849152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
8508c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
8518c3ff71bSJunchao Zhang }
8528c3ff71bSJunchao Zhang 
8538c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
854152b3e56SJunchao Zhang /*@C
8558c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
8568c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
8578c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
8588c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
8598c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
8608c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
8618c3ff71bSJunchao Zhang 
8628c3ff71bSJunchao Zhang    Collective
8638c3ff71bSJunchao Zhang 
8648c3ff71bSJunchao Zhang    Input Parameters:
8658c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
8668c3ff71bSJunchao Zhang .  m - number of rows
8678c3ff71bSJunchao Zhang .  n - number of columns
8688c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
8698c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
8708c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
8718c3ff71bSJunchao Zhang 
8728c3ff71bSJunchao Zhang    Output Parameter:
8738c3ff71bSJunchao Zhang .  A - the matrix
8748c3ff71bSJunchao Zhang 
8758c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
8768c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
8778c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
8788c3ff71bSJunchao Zhang 
8798c3ff71bSJunchao Zhang    Notes:
8808c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
8818c3ff71bSJunchao Zhang 
8828c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
8838c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
8848c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
8858c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
8868c3ff71bSJunchao Zhang 
8878c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
8888c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
8898c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
8908c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
8918c3ff71bSJunchao Zhang 
8928c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
8938c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
8948c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
8958c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
8968c3ff71bSJunchao Zhang 
8978c3ff71bSJunchao Zhang    Level: intermediate
8988c3ff71bSJunchao Zhang 
8998c3ff71bSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
9008c3ff71bSJunchao Zhang @*/
9018c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
9028c3ff71bSJunchao Zhang {
9038c3ff71bSJunchao Zhang   PetscErrorCode ierr;
9048c3ff71bSJunchao Zhang 
9058c3ff71bSJunchao Zhang   PetscFunctionBegin;
9068c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
9078c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
9088c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
9098c3ff71bSJunchao Zhang   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
9108c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
9118c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
9128c3ff71bSJunchao Zhang }
913930e68a5SMark Adams 
914930e68a5SMark Adams // factorizations
9158f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
9168f7e8f9dSMark Adams //
9178f7e8f9dSMark Adams // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
9188f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
9198f7e8f9dSMark Adams //
9208f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
921930e68a5SMark Adams {
9228f7e8f9dSMark Adams   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
9238f7e8f9dSMark Adams   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
9248f7e8f9dSMark Adams   IS                 isrow = b->row,isicol = b->icol;
925930e68a5SMark Adams   PetscErrorCode     ierr;
9268f7e8f9dSMark Adams   const PetscInt     *r_h,*ic_h;
9278f7e8f9dSMark Adams   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_d.data(), *aj_d=aijkok->j_d.data(), *bi_d=baijkok->i_d.data(), *bj_d=baijkok->j_d.data(), *bdiag_d = baijkok->diag_d->data();
9288f7e8f9dSMark Adams   const PetscScalar  *aa_d = aijkok->a_d.data();
9298f7e8f9dSMark Adams   PetscScalar        *ba_d = baijkok->a_d.data();
9308f7e8f9dSMark Adams   PetscBool          row_identity,col_identity;
9318f7e8f9dSMark Adams   PetscInt           nc, Nf, nVec=32; // should be a parameter
9328f7e8f9dSMark Adams   PetscContainer     container;
933930e68a5SMark Adams 
934930e68a5SMark Adams   PetscFunctionBegin;
9358f7e8f9dSMark Adams   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n);
9368f7e8f9dSMark Adams   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
9378f7e8f9dSMark Adams   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
9388f7e8f9dSMark Adams   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
9398f7e8f9dSMark Adams   if (container) {
9408f7e8f9dSMark Adams     PetscInt *pNf=NULL, nv;
9418f7e8f9dSMark Adams     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
9428f7e8f9dSMark Adams     Nf = (*pNf)%1000;
9438f7e8f9dSMark Adams     nv = (*pNf)/1000;
9448f7e8f9dSMark Adams     if (nv>0) nVec = nv;
9458f7e8f9dSMark Adams   } else Nf = 1;
9468f7e8f9dSMark Adams   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
9478f7e8f9dSMark Adams   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
9488f7e8f9dSMark Adams   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
9498f7e8f9dSMark Adams   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
9508f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
9518f7e8f9dSMark Adams   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
9528f7e8f9dSMark Adams #endif
9538f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
9548f7e8f9dSMark Adams   {
9558f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
9568f7e8f9dSMark Adams     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
9578f7e8f9dSMark Adams     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
9588f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
9598f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
9608f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
9618f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
9628f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
9638f7e8f9dSMark Adams     size_t flops_h = 0.0;
9648f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
9658f7e8f9dSMark Adams     Kokkos::View<size_t> d_flops_k ("flops");
9668f7e8f9dSMark Adams     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
9678f7e8f9dSMark Adams     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
9688f7e8f9dSMark Adams     Kokkos::deep_copy (d_flops_k, h_flops_k);
9698f7e8f9dSMark Adams     Kokkos::deep_copy (d_r_k, h_r_k);
9708f7e8f9dSMark Adams     Kokkos::deep_copy (d_ic_k, h_ic_k);
9718f7e8f9dSMark Adams     // Fill A --> fact
9728f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
9738f7e8f9dSMark Adams         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in Cuda
9748f7e8f9dSMark Adams         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
9758f7e8f9dSMark Adams         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
9768f7e8f9dSMark Adams         // zero rows of B
9778f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
9788f7e8f9dSMark Adams             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
9798f7e8f9dSMark Adams             PetscScalar *baL = ba_d + bi_d[rowb];
9808f7e8f9dSMark Adams             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
9818f7e8f9dSMark Adams             /* zero (unfactored row) */
9828f7e8f9dSMark Adams             for (int j=0;j<nzbL;j++) baL[j] = 0;
9838f7e8f9dSMark Adams             for (int j=0;j<nzbU;j++) baU[j] = 0;
9848f7e8f9dSMark Adams           });
9858f7e8f9dSMark Adams         // copy A into B
9868f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
9878f7e8f9dSMark Adams             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
9888f7e8f9dSMark Adams             const PetscScalar *av    = aa_d + ai_d[rowa];
9898f7e8f9dSMark Adams             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
9908f7e8f9dSMark Adams             /* load in initial (unfactored row) */
9918f7e8f9dSMark Adams             for (int j=0;j<nza;j++) {
9928f7e8f9dSMark Adams               PetscInt    colb = ic[ajtmp[j]];
9938f7e8f9dSMark Adams               PetscScalar vala = av[j];
9948f7e8f9dSMark Adams               if (colb == rowb) {
9958f7e8f9dSMark Adams                 *(ba_d + bdiag_d[rowb]) = vala;
9968f7e8f9dSMark Adams               } else {
9978f7e8f9dSMark Adams                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
9988f7e8f9dSMark Adams                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
9998f7e8f9dSMark Adams                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
10008f7e8f9dSMark Adams                 for (int j=0; j<nz ; j++) {
10018f7e8f9dSMark Adams                   if (pbj[j] == colb) {
10028f7e8f9dSMark Adams                     pba[j] = vala;
10038f7e8f9dSMark Adams                     set++;
10048f7e8f9dSMark Adams                     break;
10058f7e8f9dSMark Adams                   }
10068f7e8f9dSMark Adams                 }
10078f7e8f9dSMark Adams                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
10088f7e8f9dSMark Adams               }
10098f7e8f9dSMark Adams             }
10108f7e8f9dSMark Adams           });
10118f7e8f9dSMark Adams       });
10128f7e8f9dSMark Adams     Kokkos::fence();
1013930e68a5SMark Adams 
10148f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
10158f7e8f9dSMark Adams         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
10168f7e8f9dSMark Adams         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
10178f7e8f9dSMark Adams         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
10188f7e8f9dSMark Adams         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in Cuda
10198f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc;
10208f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
10218f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
10228f7e8f9dSMark Adams         for (int ii=start; ii<end-1; ii++) {
10238f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
10248f7e8f9dSMark Adams           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
10258f7e8f9dSMark Adams           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
10268f7e8f9dSMark Adams           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
10278f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
10288f7e8f9dSMark Adams               PetscInt kIdx = j*Ni + field_block_idx;
10298f7e8f9dSMark Adams               if (kIdx >= nzUi) /* void */ ;
10308f7e8f9dSMark Adams               else {
10318f7e8f9dSMark Adams                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
10328f7e8f9dSMark Adams                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
10338f7e8f9dSMark Adams                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
10348f7e8f9dSMark Adams                 size_t         st_idx;
10358f7e8f9dSMark Adams                 // find and do L(k,i) = A(:k,i) / A(i,i)
10368f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
10378f7e8f9dSMark Adams                 // get column, there has got to be a better way
10388f7e8f9dSMark Adams                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
10398f7e8f9dSMark Adams                     //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii);
10408f7e8f9dSMark Adams                     if (pjL[j] == ii) {
10418f7e8f9dSMark Adams                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
10428f7e8f9dSMark Adams                       idx = j; // output
10438f7e8f9dSMark Adams                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
10448f7e8f9dSMark Adams                     }
10458f7e8f9dSMark Adams                   }, st_idx);
10468f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
10478f7e8f9dSMark Adams                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii);
10488f7e8f9dSMark Adams                 else { // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
10498f7e8f9dSMark Adams                   // U(i+1,:end)
10508f7e8f9dSMark Adams                   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
10518f7e8f9dSMark Adams                       PetscScalar Uij = baUi[uiIdx];
10528f7e8f9dSMark Adams                       PetscInt    col = bjUi[uiIdx];
10538f7e8f9dSMark Adams                       if (col==myk) {
10548f7e8f9dSMark Adams                         // A_kk = A_kk - L_ki * U_ij(k)
10558f7e8f9dSMark Adams                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
10568f7e8f9dSMark Adams                         *Akkv = *Akkv - L_ki() * Uij; // UiK
10578f7e8f9dSMark Adams                       } else {
10588f7e8f9dSMark Adams                         PetscScalar    *start, *end, *pAkjv=NULL;
10598f7e8f9dSMark Adams                         PetscInt       high, low;
10608f7e8f9dSMark Adams                         const PetscInt *startj;
10618f7e8f9dSMark Adams                         if (col<myk) { // L
10628f7e8f9dSMark Adams                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
10638f7e8f9dSMark Adams                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
10648f7e8f9dSMark Adams                           start = pLki+1; // start at pLki+1, A22(myk,1)
10658f7e8f9dSMark Adams                           startj= bj_d + bi_d[myk] + idx;
10668f7e8f9dSMark Adams                           end   = ba_d + bi_d[myk+1];
10678f7e8f9dSMark Adams                         } else {
10688f7e8f9dSMark Adams                           PetscInt idx = bdiag_d[myk+1]+1;
10698f7e8f9dSMark Adams                           start = ba_d + idx;
10708f7e8f9dSMark Adams                           startj= bj_d + idx;
10718f7e8f9dSMark Adams                           end   = ba_d + bdiag_d[myk];
10728f7e8f9dSMark Adams                         }
10738f7e8f9dSMark Adams                         // search for 'col', use bisection search - TODO
10748f7e8f9dSMark Adams                         low  = 0;
10758f7e8f9dSMark Adams                         high = (PetscInt)(end-start);
10768f7e8f9dSMark Adams                         while (high-low > 5) {
10778f7e8f9dSMark Adams                           int t = (low+high)/2;
10788f7e8f9dSMark Adams                           if (startj[t] > col) high = t;
10798f7e8f9dSMark Adams                           else                 low  = t;
10808f7e8f9dSMark Adams                         }
10818f7e8f9dSMark Adams                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
10828f7e8f9dSMark Adams                           if (startj[pAkjv-start] == col) break;
10838f7e8f9dSMark Adams                         }
10848f7e8f9dSMark Adams                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col);
10858f7e8f9dSMark Adams                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
10868f7e8f9dSMark Adams                       }
10878f7e8f9dSMark Adams                     });
10888f7e8f9dSMark Adams                 }
10898f7e8f9dSMark Adams               }
10908f7e8f9dSMark Adams             });
10918f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
10928f7e8f9dSMark Adams           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
10938f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
10948f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
10958f7e8f9dSMark Adams       });
10968f7e8f9dSMark Adams     Kokkos::fence();
10978f7e8f9dSMark Adams     Kokkos::deep_copy (h_flops_k, d_flops_k);
10988f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
10998f7e8f9dSMark Adams     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
11008f7e8f9dSMark Adams #elif  defined(PETSC_USE_LOG)
11018f7e8f9dSMark Adams     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
11028f7e8f9dSMark Adams #endif
11038f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
11048f7e8f9dSMark Adams         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
11058f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
11068f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
11078f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
11088f7e8f9dSMark Adams             int i = start + outer_index*Ni + lg_rank%Ni;
11098f7e8f9dSMark Adams             if (i < end) {
11108f7e8f9dSMark Adams               PetscScalar *pv = ba_d + bdiag_d[i];
11118f7e8f9dSMark Adams               *pv = 1.0/(*pv);
11128f7e8f9dSMark Adams             }
11138f7e8f9dSMark Adams           });
11148f7e8f9dSMark Adams       });
11158f7e8f9dSMark Adams   }
11168f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
11178f7e8f9dSMark Adams   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
11188f7e8f9dSMark Adams #endif
11198f7e8f9dSMark Adams   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
11208f7e8f9dSMark Adams   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
11218f7e8f9dSMark Adams 
11228f7e8f9dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
11238f7e8f9dSMark Adams   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
11248f7e8f9dSMark Adams   if (b->inode.size) {
11258f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
11268f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
11278f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
11288f7e8f9dSMark Adams   } else {
11298f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
11308f7e8f9dSMark Adams   }
11318f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
11328f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
11338f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
11348f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
11358f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
11368f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
11378f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
11388f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
11398f7e8f9dSMark Adams 
1140930e68a5SMark Adams   PetscFunctionReturn(0);
1141930e68a5SMark Adams }
1142930e68a5SMark Adams 
114386a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1144930e68a5SMark Adams {
1145930e68a5SMark Adams   PetscErrorCode   ierr;
1146930e68a5SMark Adams 
1147930e68a5SMark Adams   PetscFunctionBegin;
1148930e68a5SMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
114986a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
115086a27549SJunchao Zhang   PetscFunctionReturn(0);
115186a27549SJunchao Zhang }
115286a27549SJunchao Zhang 
115386a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
115486a27549SJunchao Zhang {
115586a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
115686a27549SJunchao Zhang 
115786a27549SJunchao Zhang   PetscFunctionBegin;
115886a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
115986a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
116086a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
116186a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
116286a27549SJunchao Zhang   }
116386a27549SJunchao Zhang   PetscFunctionReturn(0);
116486a27549SJunchao Zhang }
116586a27549SJunchao Zhang 
116686a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
116786a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
116886a27549SJunchao Zhang {
116986a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
117086a27549SJunchao Zhang   MatColumnIndexType             n = A->rmap->n;
117186a27549SJunchao Zhang 
117286a27549SJunchao Zhang   PetscFunctionBegin;
117386a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
117486a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
117586a27549SJunchao Zhang     factors->iLt_d = MatRowOffsetKokkosView("factors->iLt_d",n+1);
117686a27549SJunchao Zhang     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
117786a27549SJunchao Zhang     factors->jLt_d = MatColumnIndexKokkosView("factors->jLt_d",factors->jL_d.extent(0));
117886a27549SJunchao Zhang     factors->aLt_d = MatValueKokkosView("factors->aLt_d",factors->aL_d.extent(0));
117986a27549SJunchao Zhang 
118086a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
118186a27549SJunchao Zhang       ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView,
118286a27549SJunchao Zhang       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView,
118386a27549SJunchao Zhang       MatRowOffsetKokkosView,DefaultExecutionSpace>(
118486a27549SJunchao Zhang         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
118586a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
118686a27549SJunchao Zhang 
118786a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
118886a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
118986a27549SJunchao Zhang     */
119086a27549SJunchao Zhang     KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace,
119186a27549SJunchao Zhang       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>(
119286a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
119386a27549SJunchao Zhang 
119486a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
119586a27549SJunchao Zhang 
119686a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
119786a27549SJunchao Zhang     factors->iUt_d = MatRowOffsetKokkosView("factors->iUt_d",n+1);
119886a27549SJunchao Zhang     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
119986a27549SJunchao Zhang     factors->jUt_d = MatColumnIndexKokkosView("factors->jUt_d",factors->jU_d.extent(0));
120086a27549SJunchao Zhang     factors->aUt_d = MatValueKokkosView("factors->aUt_d",factors->aU_d.extent(0));
120186a27549SJunchao Zhang 
120286a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
120386a27549SJunchao Zhang       ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView,
120486a27549SJunchao Zhang       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView,
120586a27549SJunchao Zhang       MatRowOffsetKokkosView,DefaultExecutionSpace>(
120686a27549SJunchao Zhang         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
120786a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
120886a27549SJunchao Zhang 
120986a27549SJunchao Zhang     /* Sort indices. See comments above */
121086a27549SJunchao Zhang     KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace,
121186a27549SJunchao Zhang       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>(
121286a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
121386a27549SJunchao Zhang 
121486a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
121586a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
121686a27549SJunchao Zhang   }
121786a27549SJunchao Zhang   PetscFunctionReturn(0);
121886a27549SJunchao Zhang }
121986a27549SJunchao Zhang 
122086a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
122186a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
122286a27549SJunchao Zhang {
122386a27549SJunchao Zhang   PetscErrorCode                 ierr;
122486a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
122586a27549SJunchao Zhang   PetscScalarKokkosView          xv;
122686a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
122786a27549SJunchao Zhang 
122886a27549SJunchao Zhang   PetscFunctionBegin;
122986a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
123086a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
123186a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
123286a27549SJunchao Zhang   /* Solve L tmpv = b */
123386a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector);
123486a27549SJunchao Zhang   /* Solve Ux = tmpv */
123586a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv);
123686a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
123786a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
123886a27549SJunchao Zhang   PetscFunctionReturn(0);
123986a27549SJunchao Zhang }
124086a27549SJunchao Zhang 
124186a27549SJunchao Zhang /* Solve A^T x = b, with A^T = U^T L^T */
124286a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
124386a27549SJunchao Zhang {
124486a27549SJunchao Zhang   PetscErrorCode                 ierr;
124586a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
124686a27549SJunchao Zhang   PetscScalarKokkosView          xv;
124786a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
124886a27549SJunchao Zhang 
124986a27549SJunchao Zhang   PetscFunctionBegin;
125086a27549SJunchao Zhang   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
125186a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
125286a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
125386a27549SJunchao Zhang   /* Solve U^T tmpv = b */
125486a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
125586a27549SJunchao Zhang 
125686a27549SJunchao Zhang   /* Solve L^T x = tmpv */
125786a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
125886a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
125986a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
126086a27549SJunchao Zhang   PetscFunctionReturn(0);
126186a27549SJunchao Zhang }
126286a27549SJunchao Zhang 
126386a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
126486a27549SJunchao Zhang {
126586a27549SJunchao Zhang   PetscErrorCode                 ierr;
126686a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
126786a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
126886a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
126986a27549SJunchao Zhang 
127086a27549SJunchao Zhang   PetscFunctionBegin;
127186a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
127286a27549SJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,aijkok->a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
127386a27549SJunchao Zhang 
127486a27549SJunchao Zhang   B->assembled                       = PETSC_TRUE;
127586a27549SJunchao Zhang   B->preallocated                    = PETSC_TRUE;
127686a27549SJunchao Zhang   B->ops->solve                      = MatSolve_SeqAIJKokkos;
127786a27549SJunchao Zhang   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
127886a27549SJunchao Zhang   B->ops->matsolve                   = NULL;
127986a27549SJunchao Zhang   B->ops->matsolvetranspose          = NULL;
128086a27549SJunchao Zhang   B->offloadmask                     = PETSC_OFFLOAD_GPU;
128186a27549SJunchao Zhang 
128286a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
128386a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
128486a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
128586a27549SJunchao Zhang   PetscFunctionReturn(0);
128686a27549SJunchao Zhang }
128786a27549SJunchao Zhang 
128886a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
128986a27549SJunchao Zhang {
129086a27549SJunchao Zhang   PetscErrorCode                 ierr;
129186a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok;
129286a27549SJunchao Zhang   Mat_SeqAIJ                     *b;
129386a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
129486a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
129586a27549SJunchao Zhang   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
129686a27549SJunchao Zhang   PetscInt                       n = A->rmap->n;
129786a27549SJunchao Zhang 
129886a27549SJunchao Zhang   PetscFunctionBegin;
129986a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
130086a27549SJunchao Zhang   /* Rebuild factors */
130186a27549SJunchao Zhang   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
130286a27549SJunchao Zhang   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
130386a27549SJunchao Zhang 
130486a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
130586a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
130686a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
130786a27549SJunchao Zhang 
130886a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
130986a27549SJunchao Zhang 
131086a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
131186a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
131286a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d,n+1);
131386a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
131486a27549SJunchao Zhang 
131586a27549SJunchao Zhang   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
131686a27549SJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
131786a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
131886a27549SJunchao Zhang 
131986a27549SJunchao Zhang   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
132086a27549SJunchao Zhang   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
132186a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
132286a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
132386a27549SJunchao Zhang 
132486a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
132586a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
132686a27549SJunchao Zhang  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
132786a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
132886a27549SJunchao Zhang  #else
132986a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
133086a27549SJunchao Zhang  #endif
133186a27549SJunchao Zhang 
133286a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
133386a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
133486a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
133586a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
133686a27549SJunchao Zhang 
133786a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
133886a27549SJunchao Zhang   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
133986a27549SJunchao Zhang   b     = (Mat_SeqAIJ*)B->data;
134086a27549SJunchao Zhang   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
134186a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
134286a27549SJunchao Zhang   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
134386a27549SJunchao Zhang 
134486a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
134586a27549SJunchao Zhang   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1346930e68a5SMark Adams   PetscFunctionReturn(0);
1347930e68a5SMark Adams }
1348930e68a5SMark Adams 
13498f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
13508f7e8f9dSMark Adams {
13518f7e8f9dSMark Adams   PetscErrorCode   ierr;
13528f7e8f9dSMark Adams   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
13538f7e8f9dSMark Adams   const PetscInt   nrows   = A->rmap->n;
1354930e68a5SMark Adams 
13558f7e8f9dSMark Adams   PetscFunctionBegin;
13568f7e8f9dSMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
13578f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
13588f7e8f9dSMark Adams   // move B data into Kokkos
13598f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
13608f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
13618f7e8f9dSMark Adams   {
13628f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
13638f7e8f9dSMark Adams     if (!baijkok->diag_d) {
13648f7e8f9dSMark Adams       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1365152b3e56SJunchao Zhang       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
13668f7e8f9dSMark Adams       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
13678f7e8f9dSMark Adams     }
13688f7e8f9dSMark Adams   }
13698f7e8f9dSMark Adams   PetscFunctionReturn(0);
13708f7e8f9dSMark Adams }
13718f7e8f9dSMark Adams 
137286a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1373930e68a5SMark Adams {
1374930e68a5SMark Adams   PetscFunctionBegin;
1375930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
1376930e68a5SMark Adams   PetscFunctionReturn(0);
1377930e68a5SMark Adams }
1378930e68a5SMark Adams 
13798f7e8f9dSMark Adams // use -pc_factor_mat_solver_type kokkosdevice
13808f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
13818f7e8f9dSMark Adams {
13828f7e8f9dSMark Adams   PetscFunctionBegin;
13838f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
13848f7e8f9dSMark Adams   PetscFunctionReturn(0);
13858f7e8f9dSMark Adams }
13868f7e8f9dSMark Adams 
1387930e68a5SMark Adams /*MC
138886a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
138986a27549SJunchao Zhang   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1390930e68a5SMark Adams 
1391930e68a5SMark Adams   Level: beginner
1392930e68a5SMark Adams 
139386a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1394930e68a5SMark Adams M*/
139586a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1396930e68a5SMark Adams {
1397930e68a5SMark Adams   PetscErrorCode ierr;
1398930e68a5SMark Adams   PetscInt       n = A->rmap->n;
1399930e68a5SMark Adams 
1400930e68a5SMark Adams   PetscFunctionBegin;
1401930e68a5SMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1402930e68a5SMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1403930e68a5SMark Adams   (*B)->factortype = ftype;
14044ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1405930e68a5SMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1406930e68a5SMark Adams 
14078f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
1408930e68a5SMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
140986a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_TRUE;
141086a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
141186a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
141286a27549SJunchao Zhang     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
141386a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
141486a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
141586a27549SJunchao Zhang   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1416930e68a5SMark Adams 
1417930e68a5SMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
141886a27549SJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1419930e68a5SMark Adams   PetscFunctionReturn(0);
1420930e68a5SMark Adams }
14218f7e8f9dSMark Adams 
14228f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
14238f7e8f9dSMark Adams {
14248f7e8f9dSMark Adams   PetscErrorCode ierr;
14258f7e8f9dSMark Adams   PetscInt       n = A->rmap->n;
14268f7e8f9dSMark Adams 
14278f7e8f9dSMark Adams   PetscFunctionBegin;
14288f7e8f9dSMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
14298f7e8f9dSMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
14308f7e8f9dSMark Adams   (*B)->factortype = ftype;
1431f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
14324ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
14338f7e8f9dSMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
14348f7e8f9dSMark Adams 
14358f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
14368f7e8f9dSMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
14378f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
14388f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
14398f7e8f9dSMark Adams 
14408f7e8f9dSMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
14418f7e8f9dSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
14428f7e8f9dSMark Adams   PetscFunctionReturn(0);
14438f7e8f9dSMark Adams }
144486a27549SJunchao Zhang 
144586a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
144686a27549SJunchao Zhang {
144786a27549SJunchao Zhang   PetscErrorCode ierr;
144886a27549SJunchao Zhang 
144986a27549SJunchao Zhang   PetscFunctionBegin;
145086a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
145186a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
145286a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
145386a27549SJunchao Zhang   PetscFunctionReturn(0);
145486a27549SJunchao Zhang }
145586a27549SJunchao Zhang 
1456