xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 86a275494187766c14dcdd7cc25ffb65d86fd341)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
2152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h>
38c3ff71bSJunchao Zhang #include <petscsystypes.h>
48c3ff71bSJunchao Zhang #include <petscerror.h>
58c3ff71bSJunchao Zhang 
68c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
7f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
88c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
98c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
10*86a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp>
11*86a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp>
12*86a27549SJunchao Zhang 
138c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h>
148c3ff71bSJunchao Zhang 
158c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
16a587d139SMark #include <petscmat.h>
178c3ff71bSJunchao Zhang 
188c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
198c3ff71bSJunchao Zhang 
208c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
218c3ff71bSJunchao Zhang {
228c3ff71bSJunchao Zhang   PetscErrorCode    ierr;
23a587d139SMark   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
248c3ff71bSJunchao Zhang 
258c3ff71bSJunchao Zhang   PetscFunctionBegin;
268c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
27*86a27549SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
28a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
29a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
30a587d139SMark   }
318c3ff71bSJunchao Zhang   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
328c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
338c3ff71bSJunchao Zhang }
348c3ff71bSJunchao Zhang 
35*86a27549SJunchao Zhang /* Sync CSR data to device if not yet */
368c3ff71bSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
378c3ff71bSJunchao Zhang {
38152b3e56SJunchao Zhang   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ*>(A->data);
398c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
408c3ff71bSJunchao Zhang   PetscInt                  nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
418c3ff71bSJunchao Zhang 
428c3ff71bSJunchao Zhang   PetscFunctionBegin;
43*86a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device");
44152b3e56SJunchao Zhang   /* If aijkok is not built yet OR the nonzero pattern on CPU has changed, build aijkok from the scratch */
458c3ff71bSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
468c3ff71bSJunchao Zhang     delete aijkok;
478c3ff71bSJunchao Zhang     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
488c3ff71bSJunchao Zhang     aijkok->nonzerostate = A->nonzerostate;
498c3ff71bSJunchao Zhang     A->spptr             = aijkok;
508c3ff71bSJunchao Zhang   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
51*86a27549SJunchao Zhang     aijkok->a_dual.clear_sync_state();
52152b3e56SJunchao Zhang     aijkok->a_dual.modify_host(); /* Mark host is modified */
53152b3e56SJunchao Zhang     aijkok->a_dual.sync_device(); /* Sync the device */
54*86a27549SJunchao Zhang     aijkok->transpose_updated = PETSC_FALSE;
55*86a27549SJunchao Zhang     aijkok->hermitian_updated = PETSC_FALSE;
568c3ff71bSJunchao Zhang   }
578c3ff71bSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
588c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
598c3ff71bSJunchao Zhang }
608c3ff71bSJunchao Zhang 
61*86a27549SJunchao Zhang /* Mark the CSR data on device is modified */
62*86a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSetDeviceModified(Mat A)
63*86a27549SJunchao Zhang {
64*86a27549SJunchao Zhang   PetscErrorCode   ierr;
65*86a27549SJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
66*86a27549SJunchao Zhang 
67*86a27549SJunchao Zhang   PetscFunctionBegin;
68*86a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries");
69*86a27549SJunchao Zhang   aijkok->a_dual.clear_sync_state();
70*86a27549SJunchao Zhang   aijkok->a_dual.modify_device();
71*86a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_FALSE;
72*86a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_FALSE;
73*86a27549SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_GPU;
74*86a27549SJunchao Zhang   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
75*86a27549SJunchao Zhang   PetscFunctionReturn(0);
76*86a27549SJunchao Zhang }
77*86a27549SJunchao Zhang 
78f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
79f0cf5187SStefano Zampini {
80f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
81f0cf5187SStefano Zampini 
82f0cf5187SStefano Zampini   PetscFunctionBegin;
83f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
84*86a27549SJunchao Zhang    /* We do not expect one needs factors on host  */
85*86a27549SJunchao Zhang   if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host");
86f0cf5187SStefano Zampini   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
87f0cf5187SStefano Zampini     if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
88*86a27549SJunchao Zhang     aijkok->a_dual.clear_sync_state();
89*86a27549SJunchao Zhang     aijkok->a_dual.modify_device(); /* Mark device is modified */
90*86a27549SJunchao Zhang     aijkok->a_dual.sync_host(); /* Sync the host */
91f0cf5187SStefano Zampini     A->offloadmask = PETSC_OFFLOAD_BOTH;
92f0cf5187SStefano Zampini   }
93f0cf5187SStefano Zampini   PetscFunctionReturn(0);
94f0cf5187SStefano Zampini }
95f0cf5187SStefano Zampini 
96f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
97f0cf5187SStefano Zampini {
98f0cf5187SStefano Zampini   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
99f0cf5187SStefano Zampini   PetscErrorCode ierr;
100f0cf5187SStefano Zampini 
101f0cf5187SStefano Zampini   PetscFunctionBegin;
102f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
103f0cf5187SStefano Zampini   *array = a->a;
104f0cf5187SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
105f0cf5187SStefano Zampini   PetscFunctionReturn(0);
106f0cf5187SStefano Zampini }
107f0cf5187SStefano Zampini 
108a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
109a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat)
110a587d139SMark {
111a587d139SMark   Mat_SeqAIJKokkos *aijkok;
112a587d139SMark   Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat);
113a587d139SMark 
114a587d139SMark   PetscFunctionBegin;
115a587d139SMark   // ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
116a587d139SMark   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
117a587d139SMark   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos");
118152b3e56SJunchao Zhang   aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k);
119a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
120a587d139SMark   PetscFunctionReturn(0);
121a587d139SMark }
122a587d139SMark 
123a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
124a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat)
125a587d139SMark {
126a587d139SMark   Mat_SeqAIJKokkos *aijkok;
127a587d139SMark 
128a587d139SMark   PetscFunctionBegin;
129a587d139SMark   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
130a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
131a587d139SMark     *d_mat = aijkok->device_mat_d.data();
132a587d139SMark   } else {
133a587d139SMark     PetscErrorCode   ierr;
134a587d139SMark     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
135a587d139SMark     *d_mat  = NULL;
136a587d139SMark   }
137a587d139SMark   PetscFunctionReturn(0);
138a587d139SMark }
139152b3e56SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose(Mat A)
140152b3e56SJunchao Zhang {
141152b3e56SJunchao Zhang   PetscErrorCode                   ierr;
142152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
143152b3e56SJunchao Zhang 
144152b3e56SJunchao Zhang   PetscFunctionBegin;
145152b3e56SJunchao Zhang   if (!aijkok->At) { /* Generate At for the first time */
146152b3e56SJunchao Zhang     ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->At);CHKERRQ(ierr);
147152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr);
148*86a27549SJunchao Zhang   } else if (!aijkok->transpose_updated) { /* Only update At values */
149152b3e56SJunchao Zhang     ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->At);CHKERRQ(ierr);
150152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosSyncDevice(aijkok->At);CHKERRQ(ierr);
151152b3e56SJunchao Zhang   }
152*86a27549SJunchao Zhang   aijkok->transpose_updated = PETSC_TRUE;
153152b3e56SJunchao Zhang   PetscFunctionReturn(0);
154152b3e56SJunchao Zhang }
155152b3e56SJunchao Zhang 
156*86a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian(Mat A)
157152b3e56SJunchao Zhang {
158152b3e56SJunchao Zhang   PetscErrorCode                   ierr;
159152b3e56SJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
160152b3e56SJunchao Zhang 
161152b3e56SJunchao Zhang   PetscFunctionBegin;
162152b3e56SJunchao Zhang   if (!aijkok->Ah) { /* Generate Ah for the first time */
163152b3e56SJunchao Zhang     ierr = MatTranspose(A,MAT_INITIAL_MATRIX,&aijkok->Ah);CHKERRQ(ierr);
164152b3e56SJunchao Zhang     ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr);
165152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr);
166*86a27549SJunchao Zhang   } else if (!aijkok->hermitian_updated) { /* Only update Ah values */
167152b3e56SJunchao Zhang     ierr = MatTranspose(A,MAT_REUSE_MATRIX,&aijkok->Ah);CHKERRQ(ierr);
168152b3e56SJunchao Zhang     ierr = MatConjugate(aijkok->Ah);CHKERRQ(ierr);
169152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosSyncDevice(aijkok->Ah);CHKERRQ(ierr);
170152b3e56SJunchao Zhang   }
171*86a27549SJunchao Zhang   aijkok->hermitian_updated = PETSC_TRUE;
172152b3e56SJunchao Zhang   PetscFunctionReturn(0);
173152b3e56SJunchao Zhang }
174a587d139SMark 
1758c3ff71bSJunchao Zhang /* y = A x */
1768c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1778c3ff71bSJunchao Zhang {
1788c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1798c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
180152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
181152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
1828c3ff71bSJunchao Zhang 
1838c3ff71bSJunchao Zhang   PetscFunctionBegin;
1848c3ff71bSJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
185152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
186152b3e56SJunchao Zhang   ierr   = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
1878c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
188152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
189152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
190152b3e56SJunchao Zhang   ierr   = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
191bb2d6e60SJunchao Zhang   ierr   = WaitForKokkos();CHKERRQ(ierr);
192152b3e56SJunchao Zhang   /* 2.0*aijkok->csrmat.nnz()-aijkok->csrmat.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */
193152b3e56SJunchao Zhang   ierr   = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
1948c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1958c3ff71bSJunchao Zhang }
1968c3ff71bSJunchao Zhang 
1978c3ff71bSJunchao Zhang /* y = A^T x */
1988c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1998c3ff71bSJunchao Zhang {
2008c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2018c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
202152b3e56SJunchao Zhang   Mat                              B;
203152b3e56SJunchao Zhang   const char                       *mode;
204152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
205152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2068c3ff71bSJunchao Zhang 
2078c3ff71bSJunchao Zhang   PetscFunctionBegin;
2088c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
209152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
210152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
211152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
212152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr);
213152b3e56SJunchao Zhang     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At;
214152b3e56SJunchao Zhang     mode = "N";
215152b3e56SJunchao Zhang   } else {
216152b3e56SJunchao Zhang     B    = A;
217152b3e56SJunchao Zhang     mode = "T";
218152b3e56SJunchao Zhang   }
219152b3e56SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
220152b3e56SJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
221152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
222152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
223bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
224152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2258c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2268c3ff71bSJunchao Zhang }
2278c3ff71bSJunchao Zhang 
2288c3ff71bSJunchao Zhang /* y = A^H x */
2298c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
2308c3ff71bSJunchao Zhang {
2318c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2328c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
233152b3e56SJunchao Zhang   Mat                              B;
234152b3e56SJunchao Zhang   const char                       *mode;
235152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv;
236152b3e56SJunchao Zhang   PetscScalarKokkosView            yv;
2378c3ff71bSJunchao Zhang 
2388c3ff71bSJunchao Zhang   PetscFunctionBegin;
2398c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
240152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
241152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
242152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
243*86a27549SJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr);
244152b3e56SJunchao Zhang     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah;
245152b3e56SJunchao Zhang     mode = "N";
246152b3e56SJunchao Zhang   } else {
247152b3e56SJunchao Zhang     B    = A;
248152b3e56SJunchao Zhang     mode = "C";
249152b3e56SJunchao Zhang   }
250152b3e56SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
251152b3e56SJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
252152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
253152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
254bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
255152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2568c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2578c3ff71bSJunchao Zhang }
2588c3ff71bSJunchao Zhang 
2598c3ff71bSJunchao Zhang /* z = A x + y */
2608c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
2618c3ff71bSJunchao Zhang {
2628c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2638c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
264152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
265152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
2668c3ff71bSJunchao Zhang 
2678c3ff71bSJunchao Zhang   PetscFunctionBegin;
2688c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
269152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
270152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
271152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
2728c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
2738c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
274152b3e56SJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
275152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
276152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
277152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
278bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
279152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
2808c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2818c3ff71bSJunchao Zhang }
2828c3ff71bSJunchao Zhang 
2838c3ff71bSJunchao Zhang /* z = A^T x + y */
2848c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
2858c3ff71bSJunchao Zhang {
2868c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2878c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
288152b3e56SJunchao Zhang   Mat                              B;
289152b3e56SJunchao Zhang   const char                       *mode;
290152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
291152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
2928c3ff71bSJunchao Zhang 
2938c3ff71bSJunchao Zhang   PetscFunctionBegin;
2948c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
295152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
296152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
297152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
2988c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
299152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
300152b3e56SJunchao Zhang     ierr = MatSeqAIJKokkosGenerateTranspose(A);CHKERRQ(ierr);
301152b3e56SJunchao Zhang     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->At;
302152b3e56SJunchao Zhang     mode = "N";
303152b3e56SJunchao Zhang   } else {
304152b3e56SJunchao Zhang     B    = A;
305152b3e56SJunchao Zhang     mode = "T";
306152b3e56SJunchao Zhang   }
307152b3e56SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
308152b3e56SJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
309152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
310152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
311152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
312bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
313152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
3148c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3158c3ff71bSJunchao Zhang }
3168c3ff71bSJunchao Zhang 
3178c3ff71bSJunchao Zhang /* z = A^H x + y */
3188c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
3198c3ff71bSJunchao Zhang {
3208c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
3218c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
322152b3e56SJunchao Zhang   Mat                              B;
323152b3e56SJunchao Zhang   const char                       *mode;
324152b3e56SJunchao Zhang   ConstPetscScalarKokkosView       xv,yv;
325152b3e56SJunchao Zhang   PetscScalarKokkosView            zv;
3268c3ff71bSJunchao Zhang 
3278c3ff71bSJunchao Zhang   PetscFunctionBegin;
3288c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
329152b3e56SJunchao Zhang   ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr);
330152b3e56SJunchao Zhang   ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr);
331152b3e56SJunchao Zhang   ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr);
3328c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
333152b3e56SJunchao Zhang   if (A->form_explicit_transpose) {
334*86a27549SJunchao Zhang     ierr = MatSeqAIJKokkosGenerateHermitian(A);CHKERRQ(ierr);
335152b3e56SJunchao Zhang     B    = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->Ah;
336152b3e56SJunchao Zhang     mode = "N";
337152b3e56SJunchao Zhang   } else {
338152b3e56SJunchao Zhang     B    = A;
339152b3e56SJunchao Zhang     mode = "C";
340152b3e56SJunchao Zhang   }
341152b3e56SJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
342152b3e56SJunchao Zhang   KokkosSparse::spmv(mode,1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
343152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr);
344152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr);
345152b3e56SJunchao Zhang   ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr);
346bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
347152b3e56SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr);
348152b3e56SJunchao Zhang   PetscFunctionReturn(0);
349152b3e56SJunchao Zhang }
350152b3e56SJunchao Zhang 
351152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)
352152b3e56SJunchao Zhang {
353152b3e56SJunchao Zhang   PetscErrorCode            ierr;
354152b3e56SJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
355152b3e56SJunchao Zhang 
356152b3e56SJunchao Zhang   PetscFunctionBegin;
357152b3e56SJunchao Zhang   switch (op) {
358152b3e56SJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
359152b3e56SJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
360152b3e56SJunchao Zhang       if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);}
361152b3e56SJunchao Zhang       A->form_explicit_transpose = flg;
362152b3e56SJunchao Zhang       break;
363152b3e56SJunchao Zhang     default:
364152b3e56SJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
365152b3e56SJunchao Zhang       break;
366152b3e56SJunchao Zhang   }
3678c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3688c3ff71bSJunchao Zhang }
3698c3ff71bSJunchao Zhang 
3703d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
3718c3ff71bSJunchao Zhang {
3728c3ff71bSJunchao Zhang   PetscErrorCode ierr;
3738c3ff71bSJunchao Zhang   Mat            B;
374c58ef05eSStefano Zampini   Mat_SeqAIJ     *aij;
3758c3ff71bSJunchao Zhang 
3768c3ff71bSJunchao Zhang   PetscFunctionBegin;
377a3f881fbSStefano Zampini   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
3788c3ff71bSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
3798c3ff71bSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
3808c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
3818c3ff71bSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
3828c3ff71bSJunchao Zhang   }
3838c3ff71bSJunchao Zhang 
3848c3ff71bSJunchao Zhang   B    = *newmat;
3858c3ff71bSJunchao Zhang   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
386*86a27549SJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */
387*86a27549SJunchao Zhang 
3888c3ff71bSJunchao Zhang   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
3890d8bce8aSStefano Zampini   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
390f0cf5187SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr);
391c58ef05eSStefano Zampini   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
392c58ef05eSStefano Zampini   aij  = (Mat_SeqAIJ*)B->data;
393c58ef05eSStefano Zampini   aij->inode.use = PETSC_FALSE;
3948c3ff71bSJunchao Zhang 
3958c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3968c3ff71bSJunchao Zhang }
3978c3ff71bSJunchao Zhang 
3988c3ff71bSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
3998c3ff71bSJunchao Zhang {
4008c3ff71bSJunchao Zhang   PetscErrorCode ierr;
4018c3ff71bSJunchao Zhang 
4028c3ff71bSJunchao Zhang   PetscFunctionBegin;
4038c3ff71bSJunchao Zhang   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
4048c3ff71bSJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
4058c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4068c3ff71bSJunchao Zhang }
4078c3ff71bSJunchao Zhang 
4088c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
4098c3ff71bSJunchao Zhang {
4108c3ff71bSJunchao Zhang   PetscErrorCode             ierr;
411*86a27549SJunchao Zhang   Mat_SeqAIJKokkos           *aijkok;
4128c3ff71bSJunchao Zhang 
4138c3ff71bSJunchao Zhang   PetscFunctionBegin;
414*86a27549SJunchao Zhang   if (A->factortype == MAT_FACTOR_NONE) {
415*86a27549SJunchao Zhang     aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
4168f7e8f9dSMark Adams     if (aijkok) {
4178f7e8f9dSMark Adams       if (aijkok->device_mat_d.data()) {
418a587d139SMark         delete aijkok->colmap_d;
419a587d139SMark         delete aijkok->i_uncompressed_d;
420a587d139SMark       }
4218f7e8f9dSMark Adams       if (aijkok->diag_d) delete aijkok->diag_d;
4228f7e8f9dSMark Adams     }
4238c3ff71bSJunchao Zhang     delete aijkok;
424*86a27549SJunchao Zhang   } else {
425*86a27549SJunchao Zhang     delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);
426*86a27549SJunchao Zhang   }
427f0cf5187SStefano Zampini   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr);
428152b3e56SJunchao Zhang   ierr     = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
429152b3e56SJunchao Zhang   A->spptr = NULL;
4308c3ff71bSJunchao Zhang   ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
4318c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4328c3ff71bSJunchao Zhang }
4338c3ff71bSJunchao Zhang 
434*86a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
435*86a27549SJunchao Zhang {
436*86a27549SJunchao Zhang   PetscErrorCode ierr;
437*86a27549SJunchao Zhang 
438*86a27549SJunchao Zhang   PetscFunctionBegin;
439*86a27549SJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
440*86a27549SJunchao Zhang   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
441*86a27549SJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
442*86a27549SJunchao Zhang   PetscFunctionReturn(0);
443*86a27549SJunchao Zhang }
444*86a27549SJunchao Zhang 
445a3f881fbSStefano Zampini #if 0
446a3f881fbSStefano Zampini static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data)
447a3f881fbSStefano Zampini {
448a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data);
449a3f881fbSStefano Zampini 
450a3f881fbSStefano Zampini   PetscFunctionBegin;
451a3f881fbSStefano Zampini   delete kh;
452a3f881fbSStefano Zampini   PetscFunctionReturn(0);
453a3f881fbSStefano Zampini }
454a3f881fbSStefano Zampini 
455a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
456a3f881fbSStefano Zampini {
457a3f881fbSStefano Zampini   Mat_Product          *product = C->product;
458a3f881fbSStefano Zampini   Mat                  A,B;
459a3f881fbSStefano Zampini   MatProductType       ptype;
460a3f881fbSStefano Zampini   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
461a3f881fbSStefano Zampini   bool                 tA,tB;
462a3f881fbSStefano Zampini   PetscErrorCode       ierr;
463a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh;
464a3f881fbSStefano Zampini   Mat_SeqAIJ           *c;
465a3f881fbSStefano Zampini 
466a3f881fbSStefano Zampini   PetscFunctionBegin;
467a3f881fbSStefano Zampini   MatCheckProduct(C,1);
468a3f881fbSStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
469a3f881fbSStefano Zampini   A = product->A;
470a3f881fbSStefano Zampini   B = product->B;
471a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
472a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
473a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
474a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
475a3f881fbSStefano Zampini   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
476a3f881fbSStefano Zampini   kh   = static_cast<MatMatKernelHandle_t*>(C->product->data);
477a3f881fbSStefano Zampini   ptype = product->type;
478a3f881fbSStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
479a3f881fbSStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
480a3f881fbSStefano Zampini   switch (ptype) {
481a3f881fbSStefano Zampini   case MATPRODUCT_AB:
482a3f881fbSStefano Zampini     tA = false;
483a3f881fbSStefano Zampini     tB = false;
484a3f881fbSStefano Zampini     break;
485a3f881fbSStefano Zampini   case MATPRODUCT_AtB:
486a3f881fbSStefano Zampini     tA = true;
487a3f881fbSStefano Zampini     tB = false;
488a3f881fbSStefano Zampini     break;
489a3f881fbSStefano Zampini   case MATPRODUCT_ABt:
490a3f881fbSStefano Zampini     tA = false;
491a3f881fbSStefano Zampini     tB = true;
492a3f881fbSStefano Zampini     break;
493a3f881fbSStefano Zampini   default:
494a3f881fbSStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
495a3f881fbSStefano Zampini   }
496a3f881fbSStefano Zampini 
497a3f881fbSStefano Zampini   KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr);
498a3f881fbSStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
499a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
500a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
501a3f881fbSStefano Zampini   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
502a3f881fbSStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
503a3f881fbSStefano Zampini   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
504a3f881fbSStefano Zampini   c->reallocs         = 0;
505a3f881fbSStefano Zampini   C->info.mallocs    += 0;
506a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
507a3f881fbSStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
508a3f881fbSStefano Zampini   C->num_ass++;
509a3f881fbSStefano Zampini   /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */
510a3f881fbSStefano Zampini   // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray()
511a3f881fbSStefano Zampini   C->offloadmask = PETSC_OFFLOAD_BOTH;
512a3f881fbSStefano Zampini   // Also, we should add support to copy back from device to host
513a3f881fbSStefano Zampini   PetscFunctionReturn(0);
514a3f881fbSStefano Zampini }
515a3f881fbSStefano Zampini 
516a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
517a3f881fbSStefano Zampini {
518a3f881fbSStefano Zampini   Mat_Product          *product = C->product;
519a3f881fbSStefano Zampini   Mat                  A,B;
520a3f881fbSStefano Zampini   MatProductType       ptype;
521a3f881fbSStefano Zampini   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
522a3f881fbSStefano Zampini   PetscInt             m,n,k;
523a3f881fbSStefano Zampini   bool                 tA,tB;
524a3f881fbSStefano Zampini   PetscErrorCode       ierr;
525a3f881fbSStefano Zampini   Mat_SeqAIJ           *c;
526a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh;
527a3f881fbSStefano Zampini 
528a3f881fbSStefano Zampini   PetscFunctionBegin;
529a3f881fbSStefano Zampini   MatCheckProduct(C,1);
530a3f881fbSStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
531a3f881fbSStefano Zampini   A = product->A;
532a3f881fbSStefano Zampini   B = product->B;
533a3f881fbSStefano Zampini   // TODO only copy the i,j data, not the values
534a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
535a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
536a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
537a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
538a3f881fbSStefano Zampini   ptype = product->type;
539a3f881fbSStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
540a3f881fbSStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
541a3f881fbSStefano Zampini   switch (ptype) {
542a3f881fbSStefano Zampini   case MATPRODUCT_AB:
543a3f881fbSStefano Zampini     tA = false;
544a3f881fbSStefano Zampini     tB = false;
545a3f881fbSStefano Zampini     m = A->rmap->n;
546a3f881fbSStefano Zampini     n = B->cmap->n;
547a3f881fbSStefano Zampini     break;
548a3f881fbSStefano Zampini   case MATPRODUCT_AtB:
549a3f881fbSStefano Zampini     tA = true;
550a3f881fbSStefano Zampini     tB = false;
551a3f881fbSStefano Zampini     m = A->cmap->n;
552a3f881fbSStefano Zampini     n = B->cmap->n;
553a3f881fbSStefano Zampini     break;
554a3f881fbSStefano Zampini   case MATPRODUCT_ABt:
555a3f881fbSStefano Zampini     tA = false;
556a3f881fbSStefano Zampini     tB = true;
557a3f881fbSStefano Zampini     m = A->rmap->n;
558a3f881fbSStefano Zampini     n = B->rmap->n;
559a3f881fbSStefano Zampini     break;
560a3f881fbSStefano Zampini   default:
561a3f881fbSStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
562a3f881fbSStefano Zampini   }
563a3f881fbSStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
564a3f881fbSStefano Zampini   ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr);
565a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
566a3f881fbSStefano Zampini 
567a3f881fbSStefano Zampini   kh = new MatMatKernelHandle_t;
568a3f881fbSStefano Zampini   // TODO SZ: ADD RUNTIME SELECTION OF THESE
569a3f881fbSStefano Zampini   kh->set_team_work_size(16);
570a3f881fbSStefano Zampini   kh->set_dynamic_scheduling(true);
571a3f881fbSStefano Zampini   // Select an spgemm algorithm, limited by configuration at compile-time and
572a3f881fbSStefano Zampini   // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED,
573a3f881fbSStefano Zampini   // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL}
574a3f881fbSStefano Zampini   std::string myalg("SPGEMM_KK_MEMORY");
575a3f881fbSStefano Zampini   kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg));
576a3f881fbSStefano Zampini 
577a3f881fbSStefano Zampini   /////////////////////////////////////
578a3f881fbSStefano Zampini   // TODO JZ
579a3f881fbSStefano Zampini   ckok = NULL; //new Mat_SeqAIJKokkos();
580a3f881fbSStefano Zampini   C->spptr = ckok;
581a3f881fbSStefano Zampini   KokkosCsrMatrix_t ccsr; // here only to have the code compile
582a3f881fbSStefano Zampini   KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr);
583a3f881fbSStefano Zampini   //cerr = WaitForKOKKOS();CHKERRCUDA(cerr);
584a3f881fbSStefano Zampini   //c->nz = get_nnz_from_ccsr
585a3f881fbSStefano Zampini   //////////////////////////////////////
586a3f881fbSStefano Zampini   c->singlemalloc = PETSC_FALSE;
587a3f881fbSStefano Zampini   c->free_a       = PETSC_TRUE;
588a3f881fbSStefano Zampini   c->free_ij      = PETSC_TRUE;
589a3f881fbSStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
590a3f881fbSStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
591a3f881fbSStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
592a3f881fbSStefano Zampini   ////////////////////////////////////
593a3f881fbSStefano Zampini   // TODO JZ copy from device to c->i and c->j
594a3f881fbSStefano Zampini   ////////////////////////////////////
595a3f881fbSStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
596a3f881fbSStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
597a3f881fbSStefano Zampini   c->maxnz = c->nz;
598a3f881fbSStefano Zampini   c->nonzerorowcnt = 0;
599a3f881fbSStefano Zampini   c->rmax = 0;
600a3f881fbSStefano Zampini   for (k = 0; k < m; k++) {
601a3f881fbSStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
602a3f881fbSStefano Zampini     c->ilen[k] = c->imax[k] = nn;
603a3f881fbSStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
604a3f881fbSStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
605a3f881fbSStefano Zampini   }
606a3f881fbSStefano Zampini 
607a3f881fbSStefano Zampini   C->nonzerostate++;
608a3f881fbSStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
609a3f881fbSStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
610a3f881fbSStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
611a3f881fbSStefano Zampini   ckok->nonzerostate = C->nonzerostate;
612a3f881fbSStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
613a3f881fbSStefano Zampini   C->preallocated  = PETSC_TRUE;
614a3f881fbSStefano Zampini   C->assembled     = PETSC_FALSE;
615a3f881fbSStefano Zampini   C->was_assembled = PETSC_FALSE;
616a3f881fbSStefano Zampini 
617a3f881fbSStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
618a3f881fbSStefano Zampini   C->product->data = kh;
619a3f881fbSStefano Zampini   C->product->destroy = MatMatKernelHandleDestroy_Private;
620a3f881fbSStefano Zampini   PetscFunctionReturn(0);
621a3f881fbSStefano Zampini }
622a3f881fbSStefano Zampini 
623a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
624a3f881fbSStefano Zampini PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
625a3f881fbSStefano Zampini {
626a3f881fbSStefano Zampini   Mat_Product    *product = mat->product;
627a3f881fbSStefano Zampini   PetscErrorCode ierr;
628a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
629a3f881fbSStefano Zampini 
630a3f881fbSStefano Zampini   PetscFunctionBegin;
631a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
632a3f881fbSStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
633a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
634a3f881fbSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
635a3f881fbSStefano Zampini   }
636a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
637a3f881fbSStefano Zampini     switch (product->type) {
638a3f881fbSStefano Zampini     case MATPRODUCT_AB:
639a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
640a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
641a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
642a3f881fbSStefano Zampini       break;
643a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
644a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
645a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
646a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
647a3f881fbSStefano Zampini       break;
648a3f881fbSStefano Zampini     default:
649a3f881fbSStefano Zampini       break;
650a3f881fbSStefano Zampini     }
651a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
652a3f881fbSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
653a3f881fbSStefano Zampini   }
654a3f881fbSStefano Zampini   PetscFunctionReturn(0);
655a3f881fbSStefano Zampini }
656a3f881fbSStefano Zampini #endif
657a3f881fbSStefano Zampini 
658a587d139SMark static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
659a587d139SMark {
660a587d139SMark   PetscErrorCode    ierr;
661a587d139SMark   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
662a587d139SMark   PetscFunctionBegin;
663*86a27549SJunchao Zhang   if (aijkok && aijkok->device_mat_d.data()) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported");
664a587d139SMark   ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr);
665a587d139SMark   PetscFunctionReturn(0);
666a587d139SMark }
667a587d139SMark 
668f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
669f0cf5187SStefano Zampini {
670f0cf5187SStefano Zampini   PetscErrorCode   ierr;
671f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
672f0cf5187SStefano Zampini 
673f0cf5187SStefano Zampini   PetscFunctionBegin;
674f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
675f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
676f0cf5187SStefano Zampini   KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d);
677f0cf5187SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
678f0cf5187SStefano Zampini   ierr = WaitForKokkos();CHKERRQ(ierr);
679f0cf5187SStefano Zampini   ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr);
680f0cf5187SStefano Zampini   // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
681f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
682f0cf5187SStefano Zampini   PetscFunctionReturn(0);
683f0cf5187SStefano Zampini }
684f0cf5187SStefano Zampini 
685a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
686a587d139SMark {
687a587d139SMark   PetscErrorCode   ierr;
688a587d139SMark   PetscBool        both = PETSC_FALSE;
689a587d139SMark   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
690a587d139SMark   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
691a587d139SMark 
692a587d139SMark   PetscFunctionBegin;
693a587d139SMark   if (aijkok && aijkok->a_d.data()) {
694f0cf5187SStefano Zampini     KokkosBlas::fill(aijkok->a_d,0.0);
695a587d139SMark     both = PETSC_TRUE;
696a587d139SMark   }
697a587d139SMark   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
698a587d139SMark   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
699a587d139SMark   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
700a587d139SMark   else A->offloadmask = PETSC_OFFLOAD_CPU;
701a587d139SMark   PetscFunctionReturn(0);
702a587d139SMark }
703a587d139SMark 
704f0cf5187SStefano Zampini static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str)
705a587d139SMark {
706a587d139SMark   PetscErrorCode ierr;
707a587d139SMark   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
708a587d139SMark 
709a587d139SMark   PetscFunctionBegin;
710f0cf5187SStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
711a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
712a587d139SMark     PetscFunctionReturn(0);
713a587d139SMark   }
714f0cf5187SStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) {
715a587d139SMark     PetscBool e;
716a587d139SMark     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
717a587d139SMark     if (e) {
718a587d139SMark       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
719f0cf5187SStefano Zampini       if (e) str = SAME_NONZERO_PATTERN;
720a587d139SMark     }
721a587d139SMark   }
722a587d139SMark   if (str != SAME_NONZERO_PATTERN) {
723a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
724a587d139SMark     PetscFunctionReturn(0);
725a587d139SMark   } else {
726a587d139SMark     if (Y->offloadmask == PETSC_OFFLOAD_CPU && X->offloadmask == PETSC_OFFLOAD_CPU) {
727a587d139SMark       ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
728a587d139SMark       PetscFunctionReturn(0);
729f0cf5187SStefano Zampini     } else {
730a587d139SMark       ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
731f0cf5187SStefano Zampini       ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
732a587d139SMark     }
733a587d139SMark     Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
734a587d139SMark     Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
735a587d139SMark     if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) {
736f0cf5187SStefano Zampini       KokkosBlas::axpy(a,aijkokX->a_d,aijkokY->a_d);
737f0cf5187SStefano Zampini       Y->offloadmask = PETSC_OFFLOAD_GPU;
738f0cf5187SStefano Zampini       ierr = WaitForKokkos();CHKERRQ(ierr);
739a587d139SMark       ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr);
740f0cf5187SStefano Zampini       // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
741f0cf5187SStefano Zampini       ierr = MatSeqAIJKokkosSyncHost(Y);CHKERRQ(ierr);
742a587d139SMark     } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???");
743a587d139SMark   }
744a587d139SMark   PetscFunctionReturn(0);
745a587d139SMark }
746a587d139SMark 
747*86a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
7488f7e8f9dSMark Adams {
7498f7e8f9dSMark Adams   PetscErrorCode ierr;
7508f7e8f9dSMark Adams 
7518f7e8f9dSMark Adams   PetscFunctionBegin;
7528f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
7538f7e8f9dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
7548f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
7558f7e8f9dSMark Adams   PetscFunctionReturn(0);
7568f7e8f9dSMark Adams }
7578f7e8f9dSMark Adams 
7588c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
7598c3ff71bSJunchao Zhang {
7608c3ff71bSJunchao Zhang   PetscFunctionBegin;
7616f3d89d0SStefano Zampini   A->boundtocpu = PETSC_FALSE;
7626f3d89d0SStefano Zampini 
763a587d139SMark   A->ops->setvalues                 = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */
7648c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
7658c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
7668c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
767a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
768f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
769a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
770a3f881fbSStefano Zampini   //A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
7718c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
7728c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
7738c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
7748c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
7758c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
7768c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
777152b3e56SJunchao Zhang   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
7788c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
7798c3ff71bSJunchao Zhang }
7808c3ff71bSJunchao Zhang 
7818c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
782152b3e56SJunchao Zhang /*@C
7838c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
7848c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
7858c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
7868c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
7878c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
7888c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
7898c3ff71bSJunchao Zhang 
7908c3ff71bSJunchao Zhang    Collective
7918c3ff71bSJunchao Zhang 
7928c3ff71bSJunchao Zhang    Input Parameters:
7938c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
7948c3ff71bSJunchao Zhang .  m - number of rows
7958c3ff71bSJunchao Zhang .  n - number of columns
7968c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
7978c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
7988c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
7998c3ff71bSJunchao Zhang 
8008c3ff71bSJunchao Zhang    Output Parameter:
8018c3ff71bSJunchao Zhang .  A - the matrix
8028c3ff71bSJunchao Zhang 
8038c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
8048c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
8058c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
8068c3ff71bSJunchao Zhang 
8078c3ff71bSJunchao Zhang    Notes:
8088c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
8098c3ff71bSJunchao Zhang 
8108c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
8118c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
8128c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
8138c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
8148c3ff71bSJunchao Zhang 
8158c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
8168c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
8178c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
8188c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
8198c3ff71bSJunchao Zhang 
8208c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
8218c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
8228c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
8238c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
8248c3ff71bSJunchao Zhang 
8258c3ff71bSJunchao Zhang    Level: intermediate
8268c3ff71bSJunchao Zhang 
8278c3ff71bSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
8288c3ff71bSJunchao Zhang @*/
8298c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
8308c3ff71bSJunchao Zhang {
8318c3ff71bSJunchao Zhang   PetscErrorCode ierr;
8328c3ff71bSJunchao Zhang 
8338c3ff71bSJunchao Zhang   PetscFunctionBegin;
8348c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
8358c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
8368c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
8378c3ff71bSJunchao Zhang   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
8388c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
8398c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
8408c3ff71bSJunchao Zhang }
841930e68a5SMark Adams 
842930e68a5SMark Adams // factorizations
8438f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
8448f7e8f9dSMark Adams //
8458f7e8f9dSMark Adams // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
8468f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
8478f7e8f9dSMark Adams //
8488f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
849930e68a5SMark Adams {
8508f7e8f9dSMark Adams   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
8518f7e8f9dSMark Adams   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
8528f7e8f9dSMark Adams   IS                 isrow = b->row,isicol = b->icol;
853930e68a5SMark Adams   PetscErrorCode     ierr;
8548f7e8f9dSMark Adams   const PetscInt     *r_h,*ic_h;
8558f7e8f9dSMark Adams   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_d.data(), *aj_d=aijkok->j_d.data(), *bi_d=baijkok->i_d.data(), *bj_d=baijkok->j_d.data(), *bdiag_d = baijkok->diag_d->data();
8568f7e8f9dSMark Adams   const PetscScalar  *aa_d = aijkok->a_d.data();
8578f7e8f9dSMark Adams   PetscScalar        *ba_d = baijkok->a_d.data();
8588f7e8f9dSMark Adams   PetscBool          row_identity,col_identity;
8598f7e8f9dSMark Adams   PetscInt           nc, Nf, nVec=32; // should be a parameter
8608f7e8f9dSMark Adams   PetscContainer     container;
861930e68a5SMark Adams 
862930e68a5SMark Adams   PetscFunctionBegin;
8638f7e8f9dSMark Adams   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n);
8648f7e8f9dSMark Adams   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
8658f7e8f9dSMark Adams   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
8668f7e8f9dSMark Adams   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
8678f7e8f9dSMark Adams   if (container) {
8688f7e8f9dSMark Adams     PetscInt *pNf=NULL, nv;
8698f7e8f9dSMark Adams     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
8708f7e8f9dSMark Adams     Nf = (*pNf)%1000;
8718f7e8f9dSMark Adams     nv = (*pNf)/1000;
8728f7e8f9dSMark Adams     if (nv>0) nVec = nv;
8738f7e8f9dSMark Adams   } else Nf = 1;
8748f7e8f9dSMark Adams   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
8758f7e8f9dSMark Adams   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
8768f7e8f9dSMark Adams   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
8778f7e8f9dSMark Adams   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
8788f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
8798f7e8f9dSMark Adams   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
8808f7e8f9dSMark Adams #endif
8818f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
8828f7e8f9dSMark Adams   {
8838f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
8848f7e8f9dSMark Adams     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
8858f7e8f9dSMark Adams     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
8868f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
8878f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
8888f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
8898f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
8908f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
8918f7e8f9dSMark Adams     size_t flops_h = 0.0;
8928f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
8938f7e8f9dSMark Adams     Kokkos::View<size_t> d_flops_k ("flops");
8948f7e8f9dSMark Adams     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
8958f7e8f9dSMark Adams     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
8968f7e8f9dSMark Adams     Kokkos::deep_copy (d_flops_k, h_flops_k);
8978f7e8f9dSMark Adams     Kokkos::deep_copy (d_r_k, h_r_k);
8988f7e8f9dSMark Adams     Kokkos::deep_copy (d_ic_k, h_ic_k);
8998f7e8f9dSMark Adams     // Fill A --> fact
9008f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
9018f7e8f9dSMark Adams         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in Cuda
9028f7e8f9dSMark Adams         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
9038f7e8f9dSMark Adams         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
9048f7e8f9dSMark Adams         // zero rows of B
9058f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
9068f7e8f9dSMark Adams             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
9078f7e8f9dSMark Adams             PetscScalar *baL = ba_d + bi_d[rowb];
9088f7e8f9dSMark Adams             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
9098f7e8f9dSMark Adams             /* zero (unfactored row) */
9108f7e8f9dSMark Adams             for (int j=0;j<nzbL;j++) baL[j] = 0;
9118f7e8f9dSMark Adams             for (int j=0;j<nzbU;j++) baU[j] = 0;
9128f7e8f9dSMark Adams           });
9138f7e8f9dSMark Adams         // copy A into B
9148f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
9158f7e8f9dSMark Adams             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
9168f7e8f9dSMark Adams             const PetscScalar *av    = aa_d + ai_d[rowa];
9178f7e8f9dSMark Adams             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
9188f7e8f9dSMark Adams             /* load in initial (unfactored row) */
9198f7e8f9dSMark Adams             for (int j=0;j<nza;j++) {
9208f7e8f9dSMark Adams               PetscInt    colb = ic[ajtmp[j]];
9218f7e8f9dSMark Adams               PetscScalar vala = av[j];
9228f7e8f9dSMark Adams               if (colb == rowb) {
9238f7e8f9dSMark Adams                 *(ba_d + bdiag_d[rowb]) = vala;
9248f7e8f9dSMark Adams               } else {
9258f7e8f9dSMark Adams                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
9268f7e8f9dSMark Adams                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
9278f7e8f9dSMark Adams                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
9288f7e8f9dSMark Adams                 for (int j=0; j<nz ; j++) {
9298f7e8f9dSMark Adams                   if (pbj[j] == colb) {
9308f7e8f9dSMark Adams                     pba[j] = vala;
9318f7e8f9dSMark Adams                     set++;
9328f7e8f9dSMark Adams                     break;
9338f7e8f9dSMark Adams                   }
9348f7e8f9dSMark Adams                 }
9358f7e8f9dSMark Adams                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
9368f7e8f9dSMark Adams               }
9378f7e8f9dSMark Adams             }
9388f7e8f9dSMark Adams           });
9398f7e8f9dSMark Adams       });
9408f7e8f9dSMark Adams     Kokkos::fence();
941930e68a5SMark Adams 
9428f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
9438f7e8f9dSMark Adams         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
9448f7e8f9dSMark Adams         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
9458f7e8f9dSMark Adams         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
9468f7e8f9dSMark Adams         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in Cuda
9478f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc;
9488f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
9498f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
9508f7e8f9dSMark Adams         for (int ii=start; ii<end-1; ii++) {
9518f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
9528f7e8f9dSMark Adams           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
9538f7e8f9dSMark Adams           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
9548f7e8f9dSMark Adams           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
9558f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
9568f7e8f9dSMark Adams               PetscInt kIdx = j*Ni + field_block_idx;
9578f7e8f9dSMark Adams               if (kIdx >= nzUi) /* void */ ;
9588f7e8f9dSMark Adams               else {
9598f7e8f9dSMark Adams                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
9608f7e8f9dSMark Adams                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
9618f7e8f9dSMark Adams                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
9628f7e8f9dSMark Adams                 size_t         st_idx;
9638f7e8f9dSMark Adams                 // find and do L(k,i) = A(:k,i) / A(i,i)
9648f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
9658f7e8f9dSMark Adams                 // get column, there has got to be a better way
9668f7e8f9dSMark Adams                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
9678f7e8f9dSMark Adams                     //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii);
9688f7e8f9dSMark Adams                     if (pjL[j] == ii) {
9698f7e8f9dSMark Adams                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
9708f7e8f9dSMark Adams                       idx = j; // output
9718f7e8f9dSMark Adams                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
9728f7e8f9dSMark Adams                     }
9738f7e8f9dSMark Adams                   }, st_idx);
9748f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
9758f7e8f9dSMark Adams                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii);
9768f7e8f9dSMark Adams                 else { // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
9778f7e8f9dSMark Adams                   // U(i+1,:end)
9788f7e8f9dSMark Adams                   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
9798f7e8f9dSMark Adams                       PetscScalar Uij = baUi[uiIdx];
9808f7e8f9dSMark Adams                       PetscInt    col = bjUi[uiIdx];
9818f7e8f9dSMark Adams                       if (col==myk) {
9828f7e8f9dSMark Adams                         // A_kk = A_kk - L_ki * U_ij(k)
9838f7e8f9dSMark Adams                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
9848f7e8f9dSMark Adams                         *Akkv = *Akkv - L_ki() * Uij; // UiK
9858f7e8f9dSMark Adams                       } else {
9868f7e8f9dSMark Adams                         PetscScalar    *start, *end, *pAkjv=NULL;
9878f7e8f9dSMark Adams                         PetscInt       high, low;
9888f7e8f9dSMark Adams                         const PetscInt *startj;
9898f7e8f9dSMark Adams                         if (col<myk) { // L
9908f7e8f9dSMark Adams                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
9918f7e8f9dSMark Adams                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
9928f7e8f9dSMark Adams                           start = pLki+1; // start at pLki+1, A22(myk,1)
9938f7e8f9dSMark Adams                           startj= bj_d + bi_d[myk] + idx;
9948f7e8f9dSMark Adams                           end   = ba_d + bi_d[myk+1];
9958f7e8f9dSMark Adams                         } else {
9968f7e8f9dSMark Adams                           PetscInt idx = bdiag_d[myk+1]+1;
9978f7e8f9dSMark Adams                           start = ba_d + idx;
9988f7e8f9dSMark Adams                           startj= bj_d + idx;
9998f7e8f9dSMark Adams                           end   = ba_d + bdiag_d[myk];
10008f7e8f9dSMark Adams                         }
10018f7e8f9dSMark Adams                         // search for 'col', use bisection search - TODO
10028f7e8f9dSMark Adams                         low  = 0;
10038f7e8f9dSMark Adams                         high = (PetscInt)(end-start);
10048f7e8f9dSMark Adams                         while (high-low > 5) {
10058f7e8f9dSMark Adams                           int t = (low+high)/2;
10068f7e8f9dSMark Adams                           if (startj[t] > col) high = t;
10078f7e8f9dSMark Adams                           else                 low  = t;
10088f7e8f9dSMark Adams                         }
10098f7e8f9dSMark Adams                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
10108f7e8f9dSMark Adams                           if (startj[pAkjv-start] == col) break;
10118f7e8f9dSMark Adams                         }
10128f7e8f9dSMark Adams                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col);
10138f7e8f9dSMark Adams                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
10148f7e8f9dSMark Adams                       }
10158f7e8f9dSMark Adams                     });
10168f7e8f9dSMark Adams                 }
10178f7e8f9dSMark Adams               }
10188f7e8f9dSMark Adams             });
10198f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
10208f7e8f9dSMark Adams           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
10218f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
10228f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
10238f7e8f9dSMark Adams       });
10248f7e8f9dSMark Adams     Kokkos::fence();
10258f7e8f9dSMark Adams     Kokkos::deep_copy (h_flops_k, d_flops_k);
10268f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
10278f7e8f9dSMark Adams     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
10288f7e8f9dSMark Adams #elif  defined(PETSC_USE_LOG)
10298f7e8f9dSMark Adams     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
10308f7e8f9dSMark Adams #endif
10318f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
10328f7e8f9dSMark Adams         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
10338f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
10348f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
10358f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
10368f7e8f9dSMark Adams             int i = start + outer_index*Ni + lg_rank%Ni;
10378f7e8f9dSMark Adams             if (i < end) {
10388f7e8f9dSMark Adams               PetscScalar *pv = ba_d + bdiag_d[i];
10398f7e8f9dSMark Adams               *pv = 1.0/(*pv);
10408f7e8f9dSMark Adams             }
10418f7e8f9dSMark Adams           });
10428f7e8f9dSMark Adams       });
10438f7e8f9dSMark Adams   }
10448f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
10458f7e8f9dSMark Adams   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
10468f7e8f9dSMark Adams #endif
10478f7e8f9dSMark Adams   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
10488f7e8f9dSMark Adams   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
10498f7e8f9dSMark Adams 
10508f7e8f9dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
10518f7e8f9dSMark Adams   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
10528f7e8f9dSMark Adams   if (b->inode.size) {
10538f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
10548f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
10558f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
10568f7e8f9dSMark Adams   } else {
10578f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
10588f7e8f9dSMark Adams   }
10598f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
10608f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
10618f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
10628f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
10638f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
10648f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
10658f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
10668f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
10678f7e8f9dSMark Adams 
1068930e68a5SMark Adams   PetscFunctionReturn(0);
1069930e68a5SMark Adams }
1070930e68a5SMark Adams 
1071*86a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1072930e68a5SMark Adams {
1073930e68a5SMark Adams   PetscErrorCode   ierr;
1074930e68a5SMark Adams 
1075930e68a5SMark Adams   PetscFunctionBegin;
1076930e68a5SMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1077*86a27549SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
1078*86a27549SJunchao Zhang   PetscFunctionReturn(0);
1079*86a27549SJunchao Zhang }
1080*86a27549SJunchao Zhang 
1081*86a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A)
1082*86a27549SJunchao Zhang {
1083*86a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1084*86a27549SJunchao Zhang 
1085*86a27549SJunchao Zhang   PetscFunctionBegin;
1086*86a27549SJunchao Zhang   if (!factors->sptrsv_symbolic_completed) {
1087*86a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d);
1088*86a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d);
1089*86a27549SJunchao Zhang     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1090*86a27549SJunchao Zhang   }
1091*86a27549SJunchao Zhang   PetscFunctionReturn(0);
1092*86a27549SJunchao Zhang }
1093*86a27549SJunchao Zhang 
1094*86a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */
1095*86a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1096*86a27549SJunchao Zhang {
1097*86a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1098*86a27549SJunchao Zhang   MatColumnIndexType             n = A->rmap->n;
1099*86a27549SJunchao Zhang 
1100*86a27549SJunchao Zhang   PetscFunctionBegin;
1101*86a27549SJunchao Zhang   if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */
1102*86a27549SJunchao Zhang     /* Update L^T and do sptrsv symbolic */
1103*86a27549SJunchao Zhang     factors->iLt_d = MatRowOffsetKokkosView("factors->iLt_d",n+1);
1104*86a27549SJunchao Zhang     Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */
1105*86a27549SJunchao Zhang     factors->jLt_d = MatColumnIndexKokkosView("factors->jLt_d",factors->jL_d.extent(0));
1106*86a27549SJunchao Zhang     factors->aLt_d = MatValueKokkosView("factors->aLt_d",factors->aL_d.extent(0));
1107*86a27549SJunchao Zhang 
1108*86a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1109*86a27549SJunchao Zhang       ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView,
1110*86a27549SJunchao Zhang       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView,
1111*86a27549SJunchao Zhang       MatRowOffsetKokkosView,DefaultExecutionSpace>(
1112*86a27549SJunchao Zhang         n,n,factors->iL_d,factors->jL_d,factors->aL_d,
1113*86a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1114*86a27549SJunchao Zhang 
1115*86a27549SJunchao Zhang     /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices.
1116*86a27549SJunchao Zhang       We have to sort the indices, until KK provides finer control options.
1117*86a27549SJunchao Zhang     */
1118*86a27549SJunchao Zhang     KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace,
1119*86a27549SJunchao Zhang       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>(
1120*86a27549SJunchao Zhang         factors->iLt_d,factors->jLt_d,factors->aLt_d);
1121*86a27549SJunchao Zhang 
1122*86a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d);
1123*86a27549SJunchao Zhang 
1124*86a27549SJunchao Zhang     /* Update U^T and do sptrsv symbolic */
1125*86a27549SJunchao Zhang     factors->iUt_d = MatRowOffsetKokkosView("factors->iUt_d",n+1);
1126*86a27549SJunchao Zhang     Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */
1127*86a27549SJunchao Zhang     factors->jUt_d = MatColumnIndexKokkosView("factors->jUt_d",factors->jU_d.extent(0));
1128*86a27549SJunchao Zhang     factors->aUt_d = MatValueKokkosView("factors->aUt_d",factors->aU_d.extent(0));
1129*86a27549SJunchao Zhang 
1130*86a27549SJunchao Zhang     KokkosKernels::Impl::transpose_matrix<
1131*86a27549SJunchao Zhang       ConstMatRowOffsetKokkosView,ConstMatColumnIndexKokkosView,ConstMatValueKokkosView,
1132*86a27549SJunchao Zhang       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView,
1133*86a27549SJunchao Zhang       MatRowOffsetKokkosView,DefaultExecutionSpace>(
1134*86a27549SJunchao Zhang         n,n,factors->iU_d, factors->jU_d, factors->aU_d,
1135*86a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1136*86a27549SJunchao Zhang 
1137*86a27549SJunchao Zhang     /* Sort indices. See comments above */
1138*86a27549SJunchao Zhang     KokkosKernels::Impl::sort_crs_matrix<DefaultExecutionSpace,
1139*86a27549SJunchao Zhang       MatRowOffsetKokkosView,MatColumnIndexKokkosView,MatValueKokkosView>(
1140*86a27549SJunchao Zhang         factors->iUt_d,factors->jUt_d,factors->aUt_d);
1141*86a27549SJunchao Zhang 
1142*86a27549SJunchao Zhang     KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d);
1143*86a27549SJunchao Zhang     factors->transpose_updated = PETSC_TRUE;
1144*86a27549SJunchao Zhang   }
1145*86a27549SJunchao Zhang   PetscFunctionReturn(0);
1146*86a27549SJunchao Zhang }
1147*86a27549SJunchao Zhang 
1148*86a27549SJunchao Zhang /* Solve Ax = b, with A = LU */
1149*86a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x)
1150*86a27549SJunchao Zhang {
1151*86a27549SJunchao Zhang   PetscErrorCode                 ierr;
1152*86a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
1153*86a27549SJunchao Zhang   PetscScalarKokkosView          xv;
1154*86a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1155*86a27549SJunchao Zhang 
1156*86a27549SJunchao Zhang   PetscFunctionBegin;
1157*86a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr);
1158*86a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1159*86a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1160*86a27549SJunchao Zhang   /* Solve L tmpv = b */
1161*86a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector);
1162*86a27549SJunchao Zhang   /* Solve Ux = tmpv */
1163*86a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv);
1164*86a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1165*86a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1166*86a27549SJunchao Zhang   PetscFunctionReturn(0);
1167*86a27549SJunchao Zhang }
1168*86a27549SJunchao Zhang 
1169*86a27549SJunchao Zhang /* Solve A^T x = b, with A^T = U^T L^T */
1170*86a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
1171*86a27549SJunchao Zhang {
1172*86a27549SJunchao Zhang   PetscErrorCode                 ierr;
1173*86a27549SJunchao Zhang   ConstPetscScalarKokkosView     bv;
1174*86a27549SJunchao Zhang   PetscScalarKokkosView          xv;
1175*86a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr;
1176*86a27549SJunchao Zhang 
1177*86a27549SJunchao Zhang   PetscFunctionBegin;
1178*86a27549SJunchao Zhang   ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
1179*86a27549SJunchao Zhang   ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
1180*86a27549SJunchao Zhang   ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
1181*86a27549SJunchao Zhang   /* Solve U^T tmpv = b */
1182*86a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector);
1183*86a27549SJunchao Zhang 
1184*86a27549SJunchao Zhang   /* Solve L^T x = tmpv */
1185*86a27549SJunchao Zhang   KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv);
1186*86a27549SJunchao Zhang   ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
1187*86a27549SJunchao Zhang   ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
1188*86a27549SJunchao Zhang   PetscFunctionReturn(0);
1189*86a27549SJunchao Zhang }
1190*86a27549SJunchao Zhang 
1191*86a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
1192*86a27549SJunchao Zhang {
1193*86a27549SJunchao Zhang   PetscErrorCode                 ierr;
1194*86a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1195*86a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1196*86a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
1197*86a27549SJunchao Zhang 
1198*86a27549SJunchao Zhang   PetscFunctionBegin;
1199*86a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1200*86a27549SJunchao Zhang   KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,aijkok->a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d);
1201*86a27549SJunchao Zhang 
1202*86a27549SJunchao Zhang   B->assembled                       = PETSC_TRUE;
1203*86a27549SJunchao Zhang   B->preallocated                    = PETSC_TRUE;
1204*86a27549SJunchao Zhang   B->ops->solve                      = MatSolve_SeqAIJKokkos;
1205*86a27549SJunchao Zhang   B->ops->solvetranspose             = MatSolveTranspose_SeqAIJKokkos;
1206*86a27549SJunchao Zhang   B->ops->matsolve                   = NULL;
1207*86a27549SJunchao Zhang   B->ops->matsolvetranspose          = NULL;
1208*86a27549SJunchao Zhang   B->offloadmask                     = PETSC_OFFLOAD_GPU;
1209*86a27549SJunchao Zhang 
1210*86a27549SJunchao Zhang   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
1211*86a27549SJunchao Zhang   factors->transpose_updated         = PETSC_FALSE;
1212*86a27549SJunchao Zhang   factors->sptrsv_symbolic_completed = PETSC_FALSE;
1213*86a27549SJunchao Zhang   PetscFunctionReturn(0);
1214*86a27549SJunchao Zhang }
1215*86a27549SJunchao Zhang 
1216*86a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1217*86a27549SJunchao Zhang {
1218*86a27549SJunchao Zhang   PetscErrorCode                 ierr;
1219*86a27549SJunchao Zhang   Mat_SeqAIJKokkos               *aijkok;
1220*86a27549SJunchao Zhang   Mat_SeqAIJ                     *b;
1221*86a27549SJunchao Zhang   Mat_SeqAIJKokkosTriFactors     *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
1222*86a27549SJunchao Zhang   PetscInt                       fill_lev = info->levels;
1223*86a27549SJunchao Zhang   PetscInt                       nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
1224*86a27549SJunchao Zhang   PetscInt                       n = A->rmap->n;
1225*86a27549SJunchao Zhang 
1226*86a27549SJunchao Zhang   PetscFunctionBegin;
1227*86a27549SJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1228*86a27549SJunchao Zhang   /* Rebuild factors */
1229*86a27549SJunchao Zhang   if (factors) {factors->Destroy();} /* Destroy the old if it exists */
1230*86a27549SJunchao Zhang   else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}
1231*86a27549SJunchao Zhang 
1232*86a27549SJunchao Zhang   /* Create a spiluk handle and then do symbolic factorization */
1233*86a27549SJunchao Zhang   nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
1234*86a27549SJunchao Zhang   factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);
1235*86a27549SJunchao Zhang 
1236*86a27549SJunchao Zhang   auto spiluk_handle = factors->kh.get_spiluk_handle();
1237*86a27549SJunchao Zhang 
1238*86a27549SJunchao Zhang   Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
1239*86a27549SJunchao Zhang   Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
1240*86a27549SJunchao Zhang   Kokkos::realloc(factors->iU_d,n+1);
1241*86a27549SJunchao Zhang   Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());
1242*86a27549SJunchao Zhang 
1243*86a27549SJunchao Zhang   aijkok = (Mat_SeqAIJKokkos*)A->spptr;
1244*86a27549SJunchao Zhang   KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,aijkok->i_d,aijkok->j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
1245*86a27549SJunchao Zhang   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
1246*86a27549SJunchao Zhang 
1247*86a27549SJunchao Zhang   Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
1248*86a27549SJunchao Zhang   Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU());
1249*86a27549SJunchao Zhang   Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */
1250*86a27549SJunchao Zhang   Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU());
1251*86a27549SJunchao Zhang 
1252*86a27549SJunchao Zhang   /* TODO: add options to select sptrsv algorithms */
1253*86a27549SJunchao Zhang   /* Create sptrsv handles for L, U and their transpose */
1254*86a27549SJunchao Zhang  #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
1255*86a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE;
1256*86a27549SJunchao Zhang  #else
1257*86a27549SJunchao Zhang   auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
1258*86a27549SJunchao Zhang  #endif
1259*86a27549SJunchao Zhang 
1260*86a27549SJunchao Zhang   factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */);
1261*86a27549SJunchao Zhang   factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */);
1262*86a27549SJunchao Zhang   factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */);
1263*86a27549SJunchao Zhang   factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */);
1264*86a27549SJunchao Zhang 
1265*86a27549SJunchao Zhang   /* Fill fields of the factor matrix B */
1266*86a27549SJunchao Zhang   ierr  = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1267*86a27549SJunchao Zhang   b     = (Mat_SeqAIJ*)B->data;
1268*86a27549SJunchao Zhang   b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU();
1269*86a27549SJunchao Zhang   B->info.fill_ratio_given  = info->fill;
1270*86a27549SJunchao Zhang   B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA);
1271*86a27549SJunchao Zhang 
1272*86a27549SJunchao Zhang   B->offloadmask            = PETSC_OFFLOAD_GPU;
1273*86a27549SJunchao Zhang   B->ops->lufactornumeric   = MatILUFactorNumeric_SeqAIJKokkos;
1274930e68a5SMark Adams   PetscFunctionReturn(0);
1275930e68a5SMark Adams }
1276930e68a5SMark Adams 
12778f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
12788f7e8f9dSMark Adams {
12798f7e8f9dSMark Adams   PetscErrorCode   ierr;
12808f7e8f9dSMark Adams   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
12818f7e8f9dSMark Adams   const PetscInt   nrows   = A->rmap->n;
1282930e68a5SMark Adams 
12838f7e8f9dSMark Adams   PetscFunctionBegin;
12848f7e8f9dSMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
12858f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
12868f7e8f9dSMark Adams   // move B data into Kokkos
12878f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
12888f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
12898f7e8f9dSMark Adams   {
12908f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
12918f7e8f9dSMark Adams     if (!baijkok->diag_d) {
12928f7e8f9dSMark Adams       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
1293152b3e56SJunchao Zhang       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag));
12948f7e8f9dSMark Adams       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
12958f7e8f9dSMark Adams     }
12968f7e8f9dSMark Adams   }
12978f7e8f9dSMark Adams   PetscFunctionReturn(0);
12988f7e8f9dSMark Adams }
12998f7e8f9dSMark Adams 
1300*86a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type)
1301930e68a5SMark Adams {
1302930e68a5SMark Adams   PetscFunctionBegin;
1303930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
1304930e68a5SMark Adams   PetscFunctionReturn(0);
1305930e68a5SMark Adams }
1306930e68a5SMark Adams 
13078f7e8f9dSMark Adams // use -pc_factor_mat_solver_type kokkosdevice
13088f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
13098f7e8f9dSMark Adams {
13108f7e8f9dSMark Adams   PetscFunctionBegin;
13118f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
13128f7e8f9dSMark Adams   PetscFunctionReturn(0);
13138f7e8f9dSMark Adams }
13148f7e8f9dSMark Adams 
1315930e68a5SMark Adams /*MC
1316*86a27549SJunchao Zhang   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1317*86a27549SJunchao Zhang   on a single GPU of type, SeqAIJKokkos, AIJKokkos.
1318930e68a5SMark Adams 
1319930e68a5SMark Adams   Level: beginner
1320930e68a5SMark Adams 
1321*86a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation
1322930e68a5SMark Adams M*/
1323*86a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
1324930e68a5SMark Adams {
1325930e68a5SMark Adams   PetscErrorCode ierr;
1326930e68a5SMark Adams   PetscInt       n = A->rmap->n;
1327930e68a5SMark Adams 
1328930e68a5SMark Adams   PetscFunctionBegin;
1329930e68a5SMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1330930e68a5SMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1331930e68a5SMark Adams   (*B)->factortype = ftype;
13324ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1333930e68a5SMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1334930e68a5SMark Adams 
13358f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
1336930e68a5SMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1337*86a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_TRUE;
1338*86a27549SJunchao Zhang     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
1339*86a27549SJunchao Zhang   } else if (ftype == MAT_FACTOR_ILU) {
1340*86a27549SJunchao Zhang     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1341*86a27549SJunchao Zhang     (*B)->canuseordering         = PETSC_FALSE;
1342*86a27549SJunchao Zhang     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
1343*86a27549SJunchao Zhang   } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
1344930e68a5SMark Adams 
1345930e68a5SMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1346*86a27549SJunchao Zhang   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
1347930e68a5SMark Adams   PetscFunctionReturn(0);
1348930e68a5SMark Adams }
13498f7e8f9dSMark Adams 
13508f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
13518f7e8f9dSMark Adams {
13528f7e8f9dSMark Adams   PetscErrorCode ierr;
13538f7e8f9dSMark Adams   PetscInt       n = A->rmap->n;
13548f7e8f9dSMark Adams 
13558f7e8f9dSMark Adams   PetscFunctionBegin;
13568f7e8f9dSMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
13578f7e8f9dSMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
13588f7e8f9dSMark Adams   (*B)->factortype = ftype;
1359f73b0415SBarry Smith   (*B)->canuseordering = PETSC_TRUE;
13604ac6704cSBarry Smith   ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
13618f7e8f9dSMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
13628f7e8f9dSMark Adams 
13638f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
13648f7e8f9dSMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
13658f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
13668f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
13678f7e8f9dSMark Adams 
13688f7e8f9dSMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
13698f7e8f9dSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
13708f7e8f9dSMark Adams   PetscFunctionReturn(0);
13718f7e8f9dSMark Adams }
1372*86a27549SJunchao Zhang 
1373*86a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
1374*86a27549SJunchao Zhang {
1375*86a27549SJunchao Zhang   PetscErrorCode ierr;
1376*86a27549SJunchao Zhang 
1377*86a27549SJunchao Zhang   PetscFunctionBegin;
1378*86a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1379*86a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
1380*86a27549SJunchao Zhang   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
1381*86a27549SJunchao Zhang   PetscFunctionReturn(0);
1382*86a27549SJunchao Zhang }
1383*86a27549SJunchao Zhang 
1384