xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 8f7e8f9d40006a7858dc4c7c538cdd867ae8e3c7)
18c3ff71bSJunchao Zhang #include "petsc/private/petscimpl.h"
28c3ff71bSJunchao Zhang #include <petscsystypes.h>
38c3ff71bSJunchao Zhang #include <petscerror.h>
49f114eb2SJunchao Zhang #include <petscveckokkos.hpp>
58c3ff71bSJunchao Zhang 
68c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
7f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
88c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
98c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
108c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
118c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h>
128c3ff71bSJunchao Zhang 
138c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
14a587d139SMark #include <petscmat.h>
158c3ff71bSJunchao Zhang 
168c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
178c3ff71bSJunchao Zhang 
188c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
198c3ff71bSJunchao Zhang {
208c3ff71bSJunchao Zhang   PetscErrorCode    ierr;
21a587d139SMark   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
228c3ff71bSJunchao Zhang 
238c3ff71bSJunchao Zhang   PetscFunctionBegin;
248c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
25a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
26a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
27a587d139SMark   }
288c3ff71bSJunchao Zhang   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
298c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
308c3ff71bSJunchao Zhang }
318c3ff71bSJunchao Zhang 
328c3ff71bSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
338c3ff71bSJunchao Zhang {
348c3ff71bSJunchao Zhang   Mat_SeqAIJ       *aijseq = (Mat_SeqAIJ*)A->data;
358c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
368c3ff71bSJunchao Zhang   PetscInt         nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
378c3ff71bSJunchao Zhang 
388c3ff71bSJunchao Zhang   PetscFunctionBegin;
39a3f881fbSStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
408c3ff71bSJunchao Zhang   /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */
418c3ff71bSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
428c3ff71bSJunchao Zhang     delete aijkok;
438c3ff71bSJunchao Zhang     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
448c3ff71bSJunchao Zhang     aijkok->nonzerostate = A->nonzerostate;
458c3ff71bSJunchao Zhang     A->spptr             = aijkok;
468c3ff71bSJunchao Zhang   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
478c3ff71bSJunchao Zhang     Kokkos::deep_copy(aijkok->a_d,aijkok->a_h);
488c3ff71bSJunchao Zhang   }
498c3ff71bSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
508c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
518c3ff71bSJunchao Zhang }
528c3ff71bSJunchao Zhang 
53f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
54f0cf5187SStefano Zampini {
55f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
56f0cf5187SStefano Zampini 
57f0cf5187SStefano Zampini   PetscFunctionBegin;
58f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
59f0cf5187SStefano Zampini   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
60f0cf5187SStefano Zampini     if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
61f0cf5187SStefano Zampini     Kokkos::deep_copy(aijkok->a_h,aijkok->a_d);
62f0cf5187SStefano Zampini     A->offloadmask = PETSC_OFFLOAD_BOTH;
63f0cf5187SStefano Zampini   }
64f0cf5187SStefano Zampini   PetscFunctionReturn(0);
65f0cf5187SStefano Zampini }
66f0cf5187SStefano Zampini 
67f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
68f0cf5187SStefano Zampini {
69f0cf5187SStefano Zampini   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
70f0cf5187SStefano Zampini   PetscErrorCode ierr;
71f0cf5187SStefano Zampini 
72f0cf5187SStefano Zampini   PetscFunctionBegin;
73f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
74f0cf5187SStefano Zampini   *array = a->a;
75f0cf5187SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
76f0cf5187SStefano Zampini   PetscFunctionReturn(0);
77f0cf5187SStefano Zampini }
78f0cf5187SStefano Zampini 
79a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
80a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat)
81a587d139SMark {
82a587d139SMark   Mat_SeqAIJKokkos *aijkok;
83a587d139SMark   Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat);
84a587d139SMark 
85a587d139SMark   PetscFunctionBegin;
86a587d139SMark   // ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
87a587d139SMark   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
88a587d139SMark   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos");
89a587d139SMark   aijkok->device_mat_d = create_mirror(DeviceMemorySpace(),h_mat_k);
90a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
91a587d139SMark   PetscFunctionReturn(0);
92a587d139SMark }
93a587d139SMark 
94a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
95a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat)
96a587d139SMark {
97a587d139SMark   Mat_SeqAIJKokkos *aijkok;
98a587d139SMark 
99a587d139SMark   PetscFunctionBegin;
100a587d139SMark   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
101a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
102a587d139SMark     *d_mat = aijkok->device_mat_d.data();
103a587d139SMark   } else {
104a587d139SMark     PetscErrorCode   ierr;
105a587d139SMark     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
106a587d139SMark     *d_mat  = NULL;
107a587d139SMark   }
108a587d139SMark   PetscFunctionReturn(0);
109a587d139SMark }
110a587d139SMark 
1118c3ff71bSJunchao Zhang /* y = A x */
1128c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1138c3ff71bSJunchao Zhang {
1148c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1158c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
1168c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv;
1178c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          yv;
1188c3ff71bSJunchao Zhang 
1198c3ff71bSJunchao Zhang   PetscFunctionBegin;
1208c3ff71bSJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1218c3ff71bSJunchao Zhang   ierr   = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1228c3ff71bSJunchao Zhang   ierr   = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
1238c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1248c3ff71bSJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
1258c3ff71bSJunchao Zhang   ierr   = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1268c3ff71bSJunchao Zhang   ierr   = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
1271e566348SJunchao Zhang   /* 2.0*aijkok->csr.nnz()-aijkok->csr.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */
128bb2d6e60SJunchao Zhang   ierr   = WaitForKokkos();CHKERRQ(ierr);
1291e566348SJunchao Zhang   ierr   = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1308c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1318c3ff71bSJunchao Zhang }
1328c3ff71bSJunchao Zhang 
1338c3ff71bSJunchao Zhang /* y = A^T x */
1348c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1358c3ff71bSJunchao Zhang {
1368c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1378c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
1388c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv;
1398c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          yv;
1408c3ff71bSJunchao Zhang 
1418c3ff71bSJunchao Zhang   PetscFunctionBegin;
1428c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1438c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1448c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
1458c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1468c3ff71bSJunchao Zhang   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
1478c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1488c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
149bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
1501e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1518c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1528c3ff71bSJunchao Zhang }
1538c3ff71bSJunchao Zhang 
1548c3ff71bSJunchao Zhang /* y = A^H x */
1558c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1568c3ff71bSJunchao Zhang {
1578c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1588c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
1598c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv;
1608c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          yv;
1618c3ff71bSJunchao Zhang 
1628c3ff71bSJunchao Zhang   PetscFunctionBegin;
1638c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1648c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1658c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
1668c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1678c3ff71bSJunchao Zhang   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
1688c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1698c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
170bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
1711e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1728c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1738c3ff71bSJunchao Zhang }
1748c3ff71bSJunchao Zhang 
1758c3ff71bSJunchao Zhang /* z = A x + y */
1768c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
1778c3ff71bSJunchao Zhang {
1788c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1798c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
1808c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv,yv;
1818c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          zv;
1828c3ff71bSJunchao Zhang 
1838c3ff71bSJunchao Zhang   PetscFunctionBegin;
1848c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1858c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1868c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
1878c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
1888c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
1898c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1908c3ff71bSJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
1918c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1928c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
1938c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
194bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
1951e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1968c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1978c3ff71bSJunchao Zhang }
1988c3ff71bSJunchao Zhang 
1998c3ff71bSJunchao Zhang /* z = A^T x + y */
2008c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
2018c3ff71bSJunchao Zhang {
2028c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2038c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
2048c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv,yv;
2058c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          zv;
2068c3ff71bSJunchao Zhang 
2078c3ff71bSJunchao Zhang   PetscFunctionBegin;
2088c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
2098c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
2108c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
2118c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
2128c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
2138c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
2148c3ff71bSJunchao Zhang   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
2158c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
2168c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
2178c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
218bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
2191e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
2208c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2218c3ff71bSJunchao Zhang }
2228c3ff71bSJunchao Zhang 
2238c3ff71bSJunchao Zhang /* z = A^H x + y */
2248c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
2258c3ff71bSJunchao Zhang {
2268c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2278c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
2288c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv,yv;
2298c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          zv;
2308c3ff71bSJunchao Zhang 
2318c3ff71bSJunchao Zhang   PetscFunctionBegin;
2328c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
2338c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
2348c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
2358c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
2368c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
2378c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
2388c3ff71bSJunchao Zhang   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
2398c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
2408c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
2418c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
242bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
2431e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
2448c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2458c3ff71bSJunchao Zhang }
2468c3ff71bSJunchao Zhang 
2473d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
2488c3ff71bSJunchao Zhang {
2498c3ff71bSJunchao Zhang   PetscErrorCode ierr;
2508c3ff71bSJunchao Zhang   Mat            B;
251c58ef05eSStefano Zampini   Mat_SeqAIJ     *aij;
2528c3ff71bSJunchao Zhang 
2538c3ff71bSJunchao Zhang   PetscFunctionBegin;
254a3f881fbSStefano Zampini   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
2558c3ff71bSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
2568c3ff71bSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
2578c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
2588c3ff71bSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
2598c3ff71bSJunchao Zhang   }
2608c3ff71bSJunchao Zhang 
2618c3ff71bSJunchao Zhang   B    = *newmat;
2628c3ff71bSJunchao Zhang   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
2638c3ff71bSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
2648c3ff71bSJunchao Zhang   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
2650d8bce8aSStefano Zampini   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
266f0cf5187SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr);
267c58ef05eSStefano Zampini   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
268c58ef05eSStefano Zampini   aij  = (Mat_SeqAIJ*)B->data;
269c58ef05eSStefano Zampini   aij->inode.use = PETSC_FALSE;
2708c3ff71bSJunchao Zhang 
2718c3ff71bSJunchao Zhang   B->offloadmask = PETSC_OFFLOAD_CPU;
2728c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2738c3ff71bSJunchao Zhang }
2748c3ff71bSJunchao Zhang 
2758c3ff71bSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
2768c3ff71bSJunchao Zhang {
2778c3ff71bSJunchao Zhang   PetscErrorCode ierr;
2788c3ff71bSJunchao Zhang 
2798c3ff71bSJunchao Zhang   PetscFunctionBegin;
2808c3ff71bSJunchao Zhang   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
2818c3ff71bSJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
2828c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2838c3ff71bSJunchao Zhang }
2848c3ff71bSJunchao Zhang 
2858c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
2868c3ff71bSJunchao Zhang {
2878c3ff71bSJunchao Zhang   PetscErrorCode        ierr;
2888c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos      *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
2898c3ff71bSJunchao Zhang 
2908c3ff71bSJunchao Zhang   PetscFunctionBegin;
291*8f7e8f9dSMark Adams   if (aijkok) {
292*8f7e8f9dSMark Adams     if (aijkok->device_mat_d.data()) {
293a587d139SMark       delete aijkok->colmap_d;
294a587d139SMark       delete aijkok->i_uncompressed_d;
295a587d139SMark     }
296*8f7e8f9dSMark Adams     if (aijkok->diag_d) delete aijkok->diag_d;
297*8f7e8f9dSMark Adams   }
2988c3ff71bSJunchao Zhang   delete aijkok;
299f0cf5187SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr);
3008c3ff71bSJunchao Zhang   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
3018c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3028c3ff71bSJunchao Zhang }
3038c3ff71bSJunchao Zhang 
304a3f881fbSStefano Zampini #if 0
305a3f881fbSStefano Zampini static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data)
306a3f881fbSStefano Zampini {
307a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data);
308a3f881fbSStefano Zampini 
309a3f881fbSStefano Zampini   PetscFunctionBegin;
310a3f881fbSStefano Zampini   delete kh;
311a3f881fbSStefano Zampini   PetscFunctionReturn(0);
312a3f881fbSStefano Zampini }
313a3f881fbSStefano Zampini 
314a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
315a3f881fbSStefano Zampini {
316a3f881fbSStefano Zampini   Mat_Product          *product = C->product;
317a3f881fbSStefano Zampini   Mat                  A,B;
318a3f881fbSStefano Zampini   MatProductType       ptype;
319a3f881fbSStefano Zampini   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
320a3f881fbSStefano Zampini   bool                 tA,tB;
321a3f881fbSStefano Zampini   PetscErrorCode       ierr;
322a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh;
323a3f881fbSStefano Zampini   Mat_SeqAIJ           *c;
324a3f881fbSStefano Zampini 
325a3f881fbSStefano Zampini   PetscFunctionBegin;
326a3f881fbSStefano Zampini   MatCheckProduct(C,1);
327a3f881fbSStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
328a3f881fbSStefano Zampini   A = product->A;
329a3f881fbSStefano Zampini   B = product->B;
330a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
331a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
332a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
333a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
334a3f881fbSStefano Zampini   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
335a3f881fbSStefano Zampini   kh   = static_cast<MatMatKernelHandle_t*>(C->product->data);
336a3f881fbSStefano Zampini   ptype = product->type;
337a3f881fbSStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
338a3f881fbSStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
339a3f881fbSStefano Zampini   switch (ptype) {
340a3f881fbSStefano Zampini   case MATPRODUCT_AB:
341a3f881fbSStefano Zampini     tA = false;
342a3f881fbSStefano Zampini     tB = false;
343a3f881fbSStefano Zampini     break;
344a3f881fbSStefano Zampini   case MATPRODUCT_AtB:
345a3f881fbSStefano Zampini     tA = true;
346a3f881fbSStefano Zampini     tB = false;
347a3f881fbSStefano Zampini     break;
348a3f881fbSStefano Zampini   case MATPRODUCT_ABt:
349a3f881fbSStefano Zampini     tA = false;
350a3f881fbSStefano Zampini     tB = true;
351a3f881fbSStefano Zampini     break;
352a3f881fbSStefano Zampini   default:
353a3f881fbSStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
354a3f881fbSStefano Zampini   }
355a3f881fbSStefano Zampini 
356a3f881fbSStefano Zampini   KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr);
357a3f881fbSStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
358a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
359a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
360a3f881fbSStefano Zampini   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
361a3f881fbSStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
362a3f881fbSStefano Zampini   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
363a3f881fbSStefano Zampini   c->reallocs         = 0;
364a3f881fbSStefano Zampini   C->info.mallocs    += 0;
365a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
366a3f881fbSStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
367a3f881fbSStefano Zampini   C->num_ass++;
368a3f881fbSStefano Zampini   /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */
369a3f881fbSStefano Zampini   // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray()
370a3f881fbSStefano Zampini   C->offloadmask = PETSC_OFFLOAD_BOTH;
371a3f881fbSStefano Zampini   // Also, we should add support to copy back from device to host
372a3f881fbSStefano Zampini   PetscFunctionReturn(0);
373a3f881fbSStefano Zampini }
374a3f881fbSStefano Zampini 
375a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
376a3f881fbSStefano Zampini {
377a3f881fbSStefano Zampini   Mat_Product          *product = C->product;
378a3f881fbSStefano Zampini   Mat                  A,B;
379a3f881fbSStefano Zampini   MatProductType       ptype;
380a3f881fbSStefano Zampini   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
381a3f881fbSStefano Zampini   PetscInt             m,n,k;
382a3f881fbSStefano Zampini   bool                 tA,tB;
383a3f881fbSStefano Zampini   PetscErrorCode       ierr;
384a3f881fbSStefano Zampini   Mat_SeqAIJ           *c;
385a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh;
386a3f881fbSStefano Zampini 
387a3f881fbSStefano Zampini   PetscFunctionBegin;
388a3f881fbSStefano Zampini   MatCheckProduct(C,1);
389a3f881fbSStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
390a3f881fbSStefano Zampini   A = product->A;
391a3f881fbSStefano Zampini   B = product->B;
392a3f881fbSStefano Zampini   // TODO only copy the i,j data, not the values
393a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
394a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
395a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
396a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
397a3f881fbSStefano Zampini   ptype = product->type;
398a3f881fbSStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
399a3f881fbSStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
400a3f881fbSStefano Zampini   switch (ptype) {
401a3f881fbSStefano Zampini   case MATPRODUCT_AB:
402a3f881fbSStefano Zampini     tA = false;
403a3f881fbSStefano Zampini     tB = false;
404a3f881fbSStefano Zampini     m = A->rmap->n;
405a3f881fbSStefano Zampini     n = B->cmap->n;
406a3f881fbSStefano Zampini     break;
407a3f881fbSStefano Zampini   case MATPRODUCT_AtB:
408a3f881fbSStefano Zampini     tA = true;
409a3f881fbSStefano Zampini     tB = false;
410a3f881fbSStefano Zampini     m = A->cmap->n;
411a3f881fbSStefano Zampini     n = B->cmap->n;
412a3f881fbSStefano Zampini     break;
413a3f881fbSStefano Zampini   case MATPRODUCT_ABt:
414a3f881fbSStefano Zampini     tA = false;
415a3f881fbSStefano Zampini     tB = true;
416a3f881fbSStefano Zampini     m = A->rmap->n;
417a3f881fbSStefano Zampini     n = B->rmap->n;
418a3f881fbSStefano Zampini     break;
419a3f881fbSStefano Zampini   default:
420a3f881fbSStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
421a3f881fbSStefano Zampini   }
422a3f881fbSStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
423a3f881fbSStefano Zampini   ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr);
424a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
425a3f881fbSStefano Zampini 
426a3f881fbSStefano Zampini   kh = new MatMatKernelHandle_t;
427a3f881fbSStefano Zampini   // TODO SZ: ADD RUNTIME SELECTION OF THESE
428a3f881fbSStefano Zampini   kh->set_team_work_size(16);
429a3f881fbSStefano Zampini   kh->set_dynamic_scheduling(true);
430a3f881fbSStefano Zampini   // Select an spgemm algorithm, limited by configuration at compile-time and
431a3f881fbSStefano Zampini   // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED,
432a3f881fbSStefano Zampini   // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL}
433a3f881fbSStefano Zampini   std::string myalg("SPGEMM_KK_MEMORY");
434a3f881fbSStefano Zampini   kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg));
435a3f881fbSStefano Zampini 
436a3f881fbSStefano Zampini   /////////////////////////////////////
437a3f881fbSStefano Zampini   // TODO JZ
438a3f881fbSStefano Zampini   ckok = NULL; //new Mat_SeqAIJKokkos();
439a3f881fbSStefano Zampini   C->spptr = ckok;
440a3f881fbSStefano Zampini   KokkosCsrMatrix_t ccsr; // here only to have the code compile
441a3f881fbSStefano Zampini   KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr);
442a3f881fbSStefano Zampini   //cerr = WaitForKOKKOS();CHKERRCUDA(cerr);
443a3f881fbSStefano Zampini   //c->nz = get_nnz_from_ccsr
444a3f881fbSStefano Zampini   //////////////////////////////////////
445a3f881fbSStefano Zampini   c->singlemalloc = PETSC_FALSE;
446a3f881fbSStefano Zampini   c->free_a       = PETSC_TRUE;
447a3f881fbSStefano Zampini   c->free_ij      = PETSC_TRUE;
448a3f881fbSStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
449a3f881fbSStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
450a3f881fbSStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
451a3f881fbSStefano Zampini   ////////////////////////////////////
452a3f881fbSStefano Zampini   // TODO JZ copy from device to c->i and c->j
453a3f881fbSStefano Zampini   ////////////////////////////////////
454a3f881fbSStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
455a3f881fbSStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
456a3f881fbSStefano Zampini   c->maxnz = c->nz;
457a3f881fbSStefano Zampini   c->nonzerorowcnt = 0;
458a3f881fbSStefano Zampini   c->rmax = 0;
459a3f881fbSStefano Zampini   for (k = 0; k < m; k++) {
460a3f881fbSStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
461a3f881fbSStefano Zampini     c->ilen[k] = c->imax[k] = nn;
462a3f881fbSStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
463a3f881fbSStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
464a3f881fbSStefano Zampini   }
465a3f881fbSStefano Zampini 
466a3f881fbSStefano Zampini   C->nonzerostate++;
467a3f881fbSStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
468a3f881fbSStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
469a3f881fbSStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
470a3f881fbSStefano Zampini   ckok->nonzerostate = C->nonzerostate;
471a3f881fbSStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
472a3f881fbSStefano Zampini   C->preallocated  = PETSC_TRUE;
473a3f881fbSStefano Zampini   C->assembled     = PETSC_FALSE;
474a3f881fbSStefano Zampini   C->was_assembled = PETSC_FALSE;
475a3f881fbSStefano Zampini 
476a3f881fbSStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
477a3f881fbSStefano Zampini   C->product->data = kh;
478a3f881fbSStefano Zampini   C->product->destroy = MatMatKernelHandleDestroy_Private;
479a3f881fbSStefano Zampini   PetscFunctionReturn(0);
480a3f881fbSStefano Zampini }
481a3f881fbSStefano Zampini 
482a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
483a3f881fbSStefano Zampini PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
484a3f881fbSStefano Zampini {
485a3f881fbSStefano Zampini   Mat_Product    *product = mat->product;
486a3f881fbSStefano Zampini   PetscErrorCode ierr;
487a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
488a3f881fbSStefano Zampini 
489a3f881fbSStefano Zampini   PetscFunctionBegin;
490a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
491a3f881fbSStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
492a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
493a3f881fbSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
494a3f881fbSStefano Zampini   }
495a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
496a3f881fbSStefano Zampini     switch (product->type) {
497a3f881fbSStefano Zampini     case MATPRODUCT_AB:
498a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
499a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
500a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
501a3f881fbSStefano Zampini       break;
502a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
503a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
504a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
505a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
506a3f881fbSStefano Zampini       break;
507a3f881fbSStefano Zampini     default:
508a3f881fbSStefano Zampini       break;
509a3f881fbSStefano Zampini     }
510a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
511a3f881fbSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
512a3f881fbSStefano Zampini   }
513a3f881fbSStefano Zampini   PetscFunctionReturn(0);
514a3f881fbSStefano Zampini }
515a3f881fbSStefano Zampini #endif
516a3f881fbSStefano Zampini 
5178c3ff71bSJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
5188c3ff71bSJunchao Zhang {
5198c3ff71bSJunchao Zhang   PetscErrorCode ierr;
5208c3ff71bSJunchao Zhang 
5218c3ff71bSJunchao Zhang   PetscFunctionBegin;
5228c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
5238c3ff71bSJunchao Zhang   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
5248c3ff71bSJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
5258c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5268c3ff71bSJunchao Zhang }
5278c3ff71bSJunchao Zhang 
528a587d139SMark static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
529a587d139SMark {
530a587d139SMark   PetscErrorCode    ierr;
531a587d139SMark   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
532a587d139SMark   PetscFunctionBegin;
533a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
534a587d139SMark     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported");
535a587d139SMark   }
536a587d139SMark   ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr);
537a587d139SMark   PetscFunctionReturn(0);
538a587d139SMark }
539a587d139SMark 
540f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
541f0cf5187SStefano Zampini {
542f0cf5187SStefano Zampini   PetscErrorCode   ierr;
543f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
544f0cf5187SStefano Zampini 
545f0cf5187SStefano Zampini   PetscFunctionBegin;
546f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
547f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
548f0cf5187SStefano Zampini   KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d);
549f0cf5187SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
550f0cf5187SStefano Zampini   ierr = WaitForKokkos();CHKERRQ(ierr);
551f0cf5187SStefano Zampini   ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr);
552f0cf5187SStefano Zampini   // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
553f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
554f0cf5187SStefano Zampini   PetscFunctionReturn(0);
555f0cf5187SStefano Zampini }
556f0cf5187SStefano Zampini 
557a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
558a587d139SMark {
559a587d139SMark   PetscErrorCode   ierr;
560a587d139SMark   PetscBool        both = PETSC_FALSE;
561a587d139SMark   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
562a587d139SMark   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
563a587d139SMark 
564a587d139SMark   PetscFunctionBegin;
565a587d139SMark   if (aijkok && aijkok->a_d.data()) {
566f0cf5187SStefano Zampini     KokkosBlas::fill(aijkok->a_d,0.0);
567a587d139SMark     both = PETSC_TRUE;
568a587d139SMark   }
569a587d139SMark   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
570a587d139SMark   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
571a587d139SMark   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
572a587d139SMark   else A->offloadmask = PETSC_OFFLOAD_CPU;
573a587d139SMark   PetscFunctionReturn(0);
574a587d139SMark }
575a587d139SMark 
576f0cf5187SStefano Zampini static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str)
577a587d139SMark {
578a587d139SMark   PetscErrorCode ierr;
579a587d139SMark   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
580a587d139SMark 
581a587d139SMark   PetscFunctionBegin;
582f0cf5187SStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
583a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
584a587d139SMark     PetscFunctionReturn(0);
585a587d139SMark   }
586f0cf5187SStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) {
587a587d139SMark     PetscBool e;
588a587d139SMark     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
589a587d139SMark     if (e) {
590a587d139SMark       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
591f0cf5187SStefano Zampini       if (e) str = SAME_NONZERO_PATTERN;
592a587d139SMark     }
593a587d139SMark   }
594a587d139SMark   if (str != SAME_NONZERO_PATTERN) {
595a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
596a587d139SMark     PetscFunctionReturn(0);
597a587d139SMark   } else {
598a587d139SMark     if (Y->offloadmask == PETSC_OFFLOAD_CPU && X->offloadmask == PETSC_OFFLOAD_CPU) {
599a587d139SMark       ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
600a587d139SMark       PetscFunctionReturn(0);
601f0cf5187SStefano Zampini     } else {
602a587d139SMark       ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
603f0cf5187SStefano Zampini       ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
604a587d139SMark     }
605a587d139SMark     Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
606a587d139SMark     Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
607a587d139SMark     if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) {
608f0cf5187SStefano Zampini       KokkosBlas::axpy(a,aijkokX->a_d,aijkokY->a_d);
609f0cf5187SStefano Zampini       Y->offloadmask = PETSC_OFFLOAD_GPU;
610f0cf5187SStefano Zampini       ierr = WaitForKokkos();CHKERRQ(ierr);
611a587d139SMark       ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr);
612f0cf5187SStefano Zampini       // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
613f0cf5187SStefano Zampini       ierr = MatSeqAIJKokkosSyncHost(Y);CHKERRQ(ierr);
614a587d139SMark     } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???");
615a587d139SMark   }
616a587d139SMark   PetscFunctionReturn(0);
617a587d139SMark }
618a587d139SMark 
619*8f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOS(Mat B,Mat A,const MatFactorInfo *info)
620*8f7e8f9dSMark Adams {
621*8f7e8f9dSMark Adams   PetscErrorCode ierr;
622*8f7e8f9dSMark Adams 
623*8f7e8f9dSMark Adams   PetscFunctionBegin;
624*8f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
625*8f7e8f9dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
626*8f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
627*8f7e8f9dSMark Adams   PetscFunctionReturn(0);
628*8f7e8f9dSMark Adams }
629*8f7e8f9dSMark Adams 
6308c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
6318c3ff71bSJunchao Zhang {
6328c3ff71bSJunchao Zhang   PetscFunctionBegin;
6336f3d89d0SStefano Zampini   A->boundtocpu = PETSC_FALSE;
6346f3d89d0SStefano Zampini 
635a587d139SMark   A->ops->setvalues                 = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */
6368c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
6378c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
6388c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
639a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
640f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
641a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
642a3f881fbSStefano Zampini   //A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
6438c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
6448c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
6458c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
6468c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
6478c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
6488c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
6498c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
6508c3ff71bSJunchao Zhang }
6518c3ff71bSJunchao Zhang 
6528c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
6538c3ff71bSJunchao Zhang /*C
6548c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
6558c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
6568c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
6578c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
6588c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
6598c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
6608c3ff71bSJunchao Zhang 
6618c3ff71bSJunchao Zhang    Collective
6628c3ff71bSJunchao Zhang 
6638c3ff71bSJunchao Zhang    Input Parameters:
6648c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
6658c3ff71bSJunchao Zhang .  m - number of rows
6668c3ff71bSJunchao Zhang .  n - number of columns
6678c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
6688c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
6698c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
6708c3ff71bSJunchao Zhang 
6718c3ff71bSJunchao Zhang    Output Parameter:
6728c3ff71bSJunchao Zhang .  A - the matrix
6738c3ff71bSJunchao Zhang 
6748c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
6758c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
6768c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
6778c3ff71bSJunchao Zhang 
6788c3ff71bSJunchao Zhang    Notes:
6798c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
6808c3ff71bSJunchao Zhang 
6818c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
6828c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
6838c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
6848c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
6858c3ff71bSJunchao Zhang 
6868c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
6878c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
6888c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
6898c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
6908c3ff71bSJunchao Zhang 
6918c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
6928c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
6938c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
6948c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
6958c3ff71bSJunchao Zhang 
6968c3ff71bSJunchao Zhang    Level: intermediate
6978c3ff71bSJunchao Zhang 
6988c3ff71bSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
6998c3ff71bSJunchao Zhang @*/
7008c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
7018c3ff71bSJunchao Zhang {
7028c3ff71bSJunchao Zhang   PetscErrorCode ierr;
7038c3ff71bSJunchao Zhang 
7048c3ff71bSJunchao Zhang   PetscFunctionBegin;
7058c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
7068c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
7078c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
7088c3ff71bSJunchao Zhang   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
7098c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
7108c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
7118c3ff71bSJunchao Zhang }
712930e68a5SMark Adams 
713930e68a5SMark Adams // factorizations
714*8f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member;
715*8f7e8f9dSMark Adams //
716*8f7e8f9dSMark Adams // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container.
717*8f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization
718*8f7e8f9dSMark Adams //
719*8f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info)
720930e68a5SMark Adams {
721*8f7e8f9dSMark Adams   Mat_SeqAIJ         *b=(Mat_SeqAIJ*)B->data;
722*8f7e8f9dSMark Adams   Mat_SeqAIJKokkos   *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
723*8f7e8f9dSMark Adams   IS                 isrow = b->row,isicol = b->icol;
724930e68a5SMark Adams   PetscErrorCode     ierr;
725*8f7e8f9dSMark Adams   const PetscInt     *r_h,*ic_h;
726*8f7e8f9dSMark Adams   const PetscInt     n=A->rmap->n, *ai_d=aijkok->i_d.data(), *aj_d=aijkok->j_d.data(), *bi_d=baijkok->i_d.data(), *bj_d=baijkok->j_d.data(), *bdiag_d = baijkok->diag_d->data();
727*8f7e8f9dSMark Adams   const PetscScalar  *aa_d = aijkok->a_d.data();
728*8f7e8f9dSMark Adams   PetscScalar        *ba_d = baijkok->a_d.data();
729*8f7e8f9dSMark Adams   PetscBool          row_identity,col_identity;
730*8f7e8f9dSMark Adams   PetscInt           nc, Nf, nVec=32; // should be a parameter
731*8f7e8f9dSMark Adams   PetscContainer     container;
732930e68a5SMark Adams 
733930e68a5SMark Adams   PetscFunctionBegin;
734*8f7e8f9dSMark Adams   if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n);
735*8f7e8f9dSMark Adams   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr);
736*8f7e8f9dSMark Adams   if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported");
737*8f7e8f9dSMark Adams   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
738*8f7e8f9dSMark Adams   if (container) {
739*8f7e8f9dSMark Adams     PetscInt *pNf=NULL, nv;
740*8f7e8f9dSMark Adams     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
741*8f7e8f9dSMark Adams     Nf = (*pNf)%1000;
742*8f7e8f9dSMark Adams     nv = (*pNf)/1000;
743*8f7e8f9dSMark Adams     if (nv>0) nVec = nv;
744*8f7e8f9dSMark Adams   } else Nf = 1;
745*8f7e8f9dSMark Adams   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
746*8f7e8f9dSMark Adams   ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr);
747*8f7e8f9dSMark Adams   ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr);
748*8f7e8f9dSMark Adams   ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr);
749*8f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
750*8f7e8f9dSMark Adams   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
751*8f7e8f9dSMark Adams #endif
752*8f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
753*8f7e8f9dSMark Adams   {
754*8f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1
755*8f7e8f9dSMark Adams     using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
756*8f7e8f9dSMark Adams     using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>;
757*8f7e8f9dSMark Adams     using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>;
758*8f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n);
759*8f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n);
760*8f7e8f9dSMark Adams     const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc);
761*8f7e8f9dSMark Adams     Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc);
762*8f7e8f9dSMark Adams     size_t flops_h = 0.0;
763*8f7e8f9dSMark Adams     Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h);
764*8f7e8f9dSMark Adams     Kokkos::View<size_t> d_flops_k ("flops");
765*8f7e8f9dSMark Adams     const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256
766*8f7e8f9dSMark Adams     const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1;
767*8f7e8f9dSMark Adams     Kokkos::deep_copy (d_flops_k, h_flops_k);
768*8f7e8f9dSMark Adams     Kokkos::deep_copy (d_r_k, h_r_k);
769*8f7e8f9dSMark Adams     Kokkos::deep_copy (d_ic_k, h_ic_k);
770*8f7e8f9dSMark Adams     // Fill A --> fact
771*8f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) {
772*8f7e8f9dSMark Adams         const PetscInt  field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in Cuda
773*8f7e8f9dSMark Adams         const PetscInt  nloc_i =  (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
774*8f7e8f9dSMark Adams         const PetscInt  *ic = d_ic_k.data(), *r = d_r_k.data();
775*8f7e8f9dSMark Adams         // zero rows of B
776*8f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
777*8f7e8f9dSMark Adams             PetscInt    nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag
778*8f7e8f9dSMark Adams             PetscScalar *baL = ba_d + bi_d[rowb];
779*8f7e8f9dSMark Adams             PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1;
780*8f7e8f9dSMark Adams             /* zero (unfactored row) */
781*8f7e8f9dSMark Adams             for (int j=0;j<nzbL;j++) baL[j] = 0;
782*8f7e8f9dSMark Adams             for (int j=0;j<nzbU;j++) baU[j] = 0;
783*8f7e8f9dSMark Adams           });
784*8f7e8f9dSMark Adams         // copy A into B
785*8f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) {
786*8f7e8f9dSMark Adams             PetscInt          rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
787*8f7e8f9dSMark Adams             const PetscScalar *av    = aa_d + ai_d[rowa];
788*8f7e8f9dSMark Adams             const PetscInt    *ajtmp = aj_d + ai_d[rowa];
789*8f7e8f9dSMark Adams             /* load in initial (unfactored row) */
790*8f7e8f9dSMark Adams             for (int j=0;j<nza;j++) {
791*8f7e8f9dSMark Adams               PetscInt    colb = ic[ajtmp[j]];
792*8f7e8f9dSMark Adams               PetscScalar vala = av[j];
793*8f7e8f9dSMark Adams               if (colb == rowb) {
794*8f7e8f9dSMark Adams                 *(ba_d + bdiag_d[rowb]) = vala;
795*8f7e8f9dSMark Adams               } else {
796*8f7e8f9dSMark Adams                 const PetscInt    *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
797*8f7e8f9dSMark Adams                 PetscScalar       *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]);
798*8f7e8f9dSMark Adams                 PetscInt          nz   = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0;
799*8f7e8f9dSMark Adams                 for (int j=0; j<nz ; j++) {
800*8f7e8f9dSMark Adams                   if (pbj[j] == colb) {
801*8f7e8f9dSMark Adams                     pba[j] = vala;
802*8f7e8f9dSMark Adams                     set++;
803*8f7e8f9dSMark Adams                     break;
804*8f7e8f9dSMark Adams                   }
805*8f7e8f9dSMark Adams                 }
806*8f7e8f9dSMark Adams                 if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n");
807*8f7e8f9dSMark Adams               }
808*8f7e8f9dSMark Adams             }
809*8f7e8f9dSMark Adams           });
810*8f7e8f9dSMark Adams       });
811*8f7e8f9dSMark Adams     Kokkos::fence();
812930e68a5SMark Adams 
813*8f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) {
814*8f7e8f9dSMark Adams         sizet_scr_t     colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL));
815*8f7e8f9dSMark Adams         scalar_scr_t    L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL));
816*8f7e8f9dSMark Adams         sizet_scr_t     flops(team.team_scratch(KOKKOS_SHARED_LEVEL));
817*8f7e8f9dSMark Adams         const PetscInt  field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in Cuda
818*8f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc;
819*8f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; });
820*8f7e8f9dSMark Adams         // A22 panel update for each row A(1,:) and col A(:,1)
821*8f7e8f9dSMark Adams         for (int ii=start; ii<end-1; ii++) {
822*8f7e8f9dSMark Adams           const PetscInt    *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end)
823*8f7e8f9dSMark Adams           const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data  U(i,i+1:end)
824*8f7e8f9dSMark Adams           const PetscInt    nUi_its = nzUi/Ni + !!(nzUi%Ni);
825*8f7e8f9dSMark Adams           const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place
826*8f7e8f9dSMark Adams           Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) {
827*8f7e8f9dSMark Adams               PetscInt kIdx = j*Ni + field_block_idx;
828*8f7e8f9dSMark Adams               if (kIdx >= nzUi) /* void */ ;
829*8f7e8f9dSMark Adams               else {
830*8f7e8f9dSMark Adams                 const PetscInt myk  = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general
831*8f7e8f9dSMark Adams                 const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row
832*8f7e8f9dSMark Adams                 const PetscInt nzL  = bi_d[myk+1] - bi_d[myk]; // size of L_k(:)
833*8f7e8f9dSMark Adams                 size_t         st_idx;
834*8f7e8f9dSMark Adams                 // find and do L(k,i) = A(:k,i) / A(i,i)
835*8f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; });
836*8f7e8f9dSMark Adams                 // get column, there has got to be a better way
837*8f7e8f9dSMark Adams                 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) {
838*8f7e8f9dSMark Adams                     //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii);
839*8f7e8f9dSMark Adams                     if (pjL[j] == ii) {
840*8f7e8f9dSMark Adams                       PetscScalar *pLki = ba_d + bi_d[myk] + j;
841*8f7e8f9dSMark Adams                       idx = j; // output
842*8f7e8f9dSMark Adams                       *pLki = *pLki/Bii; // column scaling:  L(k,i) = A(:k,i) / A(i,i)
843*8f7e8f9dSMark Adams                     }
844*8f7e8f9dSMark Adams                   }, st_idx);
845*8f7e8f9dSMark Adams                 Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); });
846*8f7e8f9dSMark Adams                 if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii);
847*8f7e8f9dSMark Adams                 else { // active row k, do  A_kj -= Lki * U_ij; j \in U(i,:) j != i
848*8f7e8f9dSMark Adams                   // U(i+1,:end)
849*8f7e8f9dSMark Adams                   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U)
850*8f7e8f9dSMark Adams                       PetscScalar Uij = baUi[uiIdx];
851*8f7e8f9dSMark Adams                       PetscInt    col = bjUi[uiIdx];
852*8f7e8f9dSMark Adams                       if (col==myk) {
853*8f7e8f9dSMark Adams                         // A_kk = A_kk - L_ki * U_ij(k)
854*8f7e8f9dSMark Adams                         PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place
855*8f7e8f9dSMark Adams                         *Akkv = *Akkv - L_ki() * Uij; // UiK
856*8f7e8f9dSMark Adams                       } else {
857*8f7e8f9dSMark Adams                         PetscScalar    *start, *end, *pAkjv=NULL;
858*8f7e8f9dSMark Adams                         PetscInt       high, low;
859*8f7e8f9dSMark Adams                         const PetscInt *startj;
860*8f7e8f9dSMark Adams                         if (col<myk) { // L
861*8f7e8f9dSMark Adams                           PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx();
862*8f7e8f9dSMark Adams                           PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row
863*8f7e8f9dSMark Adams                           start = pLki+1; // start at pLki+1, A22(myk,1)
864*8f7e8f9dSMark Adams                           startj= bj_d + bi_d[myk] + idx;
865*8f7e8f9dSMark Adams                           end   = ba_d + bi_d[myk+1];
866*8f7e8f9dSMark Adams                         } else {
867*8f7e8f9dSMark Adams                           PetscInt idx = bdiag_d[myk+1]+1;
868*8f7e8f9dSMark Adams                           start = ba_d + idx;
869*8f7e8f9dSMark Adams                           startj= bj_d + idx;
870*8f7e8f9dSMark Adams                           end   = ba_d + bdiag_d[myk];
871*8f7e8f9dSMark Adams                         }
872*8f7e8f9dSMark Adams                         // search for 'col', use bisection search - TODO
873*8f7e8f9dSMark Adams                         low  = 0;
874*8f7e8f9dSMark Adams                         high = (PetscInt)(end-start);
875*8f7e8f9dSMark Adams                         while (high-low > 5) {
876*8f7e8f9dSMark Adams                           int t = (low+high)/2;
877*8f7e8f9dSMark Adams                           if (startj[t] > col) high = t;
878*8f7e8f9dSMark Adams                           else                 low  = t;
879*8f7e8f9dSMark Adams                         }
880*8f7e8f9dSMark Adams                         for (pAkjv=start+low; pAkjv<start+high; pAkjv++) {
881*8f7e8f9dSMark Adams                           if (startj[pAkjv-start] == col) break;
882*8f7e8f9dSMark Adams                         }
883*8f7e8f9dSMark Adams                         if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col);
884*8f7e8f9dSMark Adams                         *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij
885*8f7e8f9dSMark Adams                       }
886*8f7e8f9dSMark Adams                     });
887*8f7e8f9dSMark Adams                 }
888*8f7e8f9dSMark Adams               }
889*8f7e8f9dSMark Adams             });
890*8f7e8f9dSMark Adams           team.team_barrier(); // this needs to be a league barrier to use more that one SM per block
891*8f7e8f9dSMark Adams           if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); });
892*8f7e8f9dSMark Adams         } /* endof for (i=0; i<n; i++) { */
893*8f7e8f9dSMark Adams         Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; });
894*8f7e8f9dSMark Adams       });
895*8f7e8f9dSMark Adams     Kokkos::fence();
896*8f7e8f9dSMark Adams     Kokkos::deep_copy (h_flops_k, d_flops_k);
897*8f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
898*8f7e8f9dSMark Adams     ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
899*8f7e8f9dSMark Adams #elif  defined(PETSC_USE_LOG)
900*8f7e8f9dSMark Adams     ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr);
901*8f7e8f9dSMark Adams #endif
902*8f7e8f9dSMark Adams     Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) {
903*8f7e8f9dSMark Adams         const PetscInt  lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni;
904*8f7e8f9dSMark Adams         const PetscInt  start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters
905*8f7e8f9dSMark Adams         /* Invert diagonal for simpler triangular solves */
906*8f7e8f9dSMark Adams         Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) {
907*8f7e8f9dSMark Adams             int i = start + outer_index*Ni + lg_rank%Ni;
908*8f7e8f9dSMark Adams             if (i < end) {
909*8f7e8f9dSMark Adams               PetscScalar *pv = ba_d + bdiag_d[i];
910*8f7e8f9dSMark Adams               *pv = 1.0/(*pv);
911*8f7e8f9dSMark Adams             }
912*8f7e8f9dSMark Adams           });
913*8f7e8f9dSMark Adams       });
914*8f7e8f9dSMark Adams   }
915*8f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG)
916*8f7e8f9dSMark Adams   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
917*8f7e8f9dSMark Adams #endif
918*8f7e8f9dSMark Adams   ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr);
919*8f7e8f9dSMark Adams   ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr);
920*8f7e8f9dSMark Adams 
921*8f7e8f9dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
922*8f7e8f9dSMark Adams   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
923*8f7e8f9dSMark Adams   if (b->inode.size) {
924*8f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_Inode;
925*8f7e8f9dSMark Adams   } else if (row_identity && col_identity) {
926*8f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering;
927*8f7e8f9dSMark Adams   } else {
928*8f7e8f9dSMark Adams     B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos
929*8f7e8f9dSMark Adams   }
930*8f7e8f9dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
931*8f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU
932*8f7e8f9dSMark Adams   B->ops->solveadd          = MatSolveAdd_SeqAIJ; // and this
933*8f7e8f9dSMark Adams   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
934*8f7e8f9dSMark Adams   B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
935*8f7e8f9dSMark Adams   B->ops->matsolve          = MatMatSolve_SeqAIJ;
936*8f7e8f9dSMark Adams   B->assembled              = PETSC_TRUE;
937*8f7e8f9dSMark Adams   B->preallocated           = PETSC_TRUE;
938*8f7e8f9dSMark Adams 
939930e68a5SMark Adams   PetscFunctionReturn(0);
940930e68a5SMark Adams }
941930e68a5SMark Adams 
942930e68a5SMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOS(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
943930e68a5SMark Adams {
944930e68a5SMark Adams   PetscErrorCode   ierr;
945930e68a5SMark Adams 
946930e68a5SMark Adams   PetscFunctionBegin;
947930e68a5SMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
948930e68a5SMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOS;
949930e68a5SMark Adams   PetscFunctionReturn(0);
950930e68a5SMark Adams }
951930e68a5SMark Adams 
952*8f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
953*8f7e8f9dSMark Adams {
954*8f7e8f9dSMark Adams   PetscErrorCode   ierr;
955*8f7e8f9dSMark Adams   Mat_SeqAIJ       *b=(Mat_SeqAIJ*)B->data;
956*8f7e8f9dSMark Adams   const PetscInt   nrows   = A->rmap->n;
957930e68a5SMark Adams 
958*8f7e8f9dSMark Adams   PetscFunctionBegin;
959*8f7e8f9dSMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
960*8f7e8f9dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE;
961*8f7e8f9dSMark Adams   // move B data into Kokkos
962*8f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok
963*8f7e8f9dSMark Adams   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok
964*8f7e8f9dSMark Adams   {
965*8f7e8f9dSMark Adams     Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
966*8f7e8f9dSMark Adams     if (!baijkok->diag_d) {
967*8f7e8f9dSMark Adams       const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1);
968*8f7e8f9dSMark Adams       baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_diag));
969*8f7e8f9dSMark Adams       Kokkos::deep_copy (*baijkok->diag_d, h_diag);
970*8f7e8f9dSMark Adams     }
971*8f7e8f9dSMark Adams   }
972*8f7e8f9dSMark Adams   PetscFunctionReturn(0);
973*8f7e8f9dSMark Adams }
974*8f7e8f9dSMark Adams 
975*8f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat,MatFactorType,Mat*);
976*8f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat,MatFactorType,Mat*);
977930e68a5SMark Adams 
978930e68a5SMark Adams PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
979930e68a5SMark Adams {
980930e68a5SMark Adams   PetscErrorCode ierr;
981930e68a5SMark Adams 
982930e68a5SMark Adams   PetscFunctionBegin;
983930e68a5SMark Adams   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos);CHKERRQ(ierr);
984*8f7e8f9dSMark Adams   ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
985930e68a5SMark Adams   PetscFunctionReturn(0);
986930e68a5SMark Adams }
987930e68a5SMark Adams 
988930e68a5SMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos(Mat A,MatSolverType *type)
989930e68a5SMark Adams {
990930e68a5SMark Adams   PetscFunctionBegin;
991930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
992930e68a5SMark Adams   PetscFunctionReturn(0);
993930e68a5SMark Adams }
994930e68a5SMark Adams 
995*8f7e8f9dSMark Adams // use -pc_factor_mat_solver_type kokkosdevice
996*8f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type)
997*8f7e8f9dSMark Adams {
998*8f7e8f9dSMark Adams   PetscFunctionBegin;
999*8f7e8f9dSMark Adams   *type = MATSOLVERKOKKOSDEVICE;
1000*8f7e8f9dSMark Adams   PetscFunctionReturn(0);
1001*8f7e8f9dSMark Adams }
1002*8f7e8f9dSMark Adams 
1003930e68a5SMark Adams /*MC
1004930e68a5SMark Adams   MATSOLVERKOKKOS = "kokkos" - A matrix solver type providing triangular solvers for sequential matrices
1005930e68a5SMark Adams   on a single GPU of type, seqaijkokkos, aijkokkos.
1006930e68a5SMark Adams 
1007930e68a5SMark Adams   Level: beginner
1008930e68a5SMark Adams 
1009930e68a5SMark Adams .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKOKKOS(), MATAIJKOKKOS, MatCreateAIJKOKKOS(), MatKOKKOSSetFormat(), MatKOKKOSStorageFormat, MatKOKKOSFormatOperation
1010930e68a5SMark Adams M*/
1011930e68a5SMark Adams 
1012930e68a5SMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat A,MatFactorType ftype,Mat *B)
1013930e68a5SMark Adams {
1014930e68a5SMark Adams   PetscErrorCode ierr;
1015930e68a5SMark Adams   PetscInt       n = A->rmap->n;
1016930e68a5SMark Adams 
1017930e68a5SMark Adams   PetscFunctionBegin;
1018930e68a5SMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1019930e68a5SMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1020930e68a5SMark Adams   (*B)->factortype = ftype;
1021930e68a5SMark Adams   (*B)->useordering = PETSC_TRUE;
1022930e68a5SMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1023930e68a5SMark Adams 
1024*8f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
1025930e68a5SMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1026930e68a5SMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOS;
1027930e68a5SMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1028930e68a5SMark Adams 
1029930e68a5SMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1030930e68a5SMark Adams   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos);CHKERRQ(ierr);
1031930e68a5SMark Adams   PetscFunctionReturn(0);
1032930e68a5SMark Adams }
1033*8f7e8f9dSMark Adams 
1034*8f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
1035*8f7e8f9dSMark Adams {
1036*8f7e8f9dSMark Adams   PetscErrorCode ierr;
1037*8f7e8f9dSMark Adams   PetscInt       n = A->rmap->n;
1038*8f7e8f9dSMark Adams 
1039*8f7e8f9dSMark Adams   PetscFunctionBegin;
1040*8f7e8f9dSMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
1041*8f7e8f9dSMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1042*8f7e8f9dSMark Adams   (*B)->factortype = ftype;
1043*8f7e8f9dSMark Adams   (*B)->useordering = PETSC_TRUE;
1044*8f7e8f9dSMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
1045*8f7e8f9dSMark Adams 
1046*8f7e8f9dSMark Adams   if (ftype == MAT_FACTOR_LU) {
1047*8f7e8f9dSMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1048*8f7e8f9dSMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
1049*8f7e8f9dSMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
1050*8f7e8f9dSMark Adams 
1051*8f7e8f9dSMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1052*8f7e8f9dSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
1053*8f7e8f9dSMark Adams   PetscFunctionReturn(0);
1054*8f7e8f9dSMark Adams }
1055