xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 6f3d89d0b697a3d79aa40878724d2c791c95edac)
18c3ff71bSJunchao Zhang #include "petsc/private/petscimpl.h"
28c3ff71bSJunchao Zhang #include <petscsystypes.h>
38c3ff71bSJunchao Zhang #include <petscerror.h>
49f114eb2SJunchao Zhang #include <petscveckokkos.hpp>
58c3ff71bSJunchao Zhang 
68c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
7f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
88c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
98c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
108c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
118c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h>
128c3ff71bSJunchao Zhang 
138c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
14a587d139SMark #include <petscmat.h>
158c3ff71bSJunchao Zhang 
168c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
178c3ff71bSJunchao Zhang 
188c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
198c3ff71bSJunchao Zhang {
208c3ff71bSJunchao Zhang   PetscErrorCode    ierr;
21a587d139SMark   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
228c3ff71bSJunchao Zhang 
238c3ff71bSJunchao Zhang   PetscFunctionBegin;
248c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
25a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
26a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
27a587d139SMark   }
288c3ff71bSJunchao Zhang   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
298c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
308c3ff71bSJunchao Zhang }
318c3ff71bSJunchao Zhang 
328c3ff71bSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
338c3ff71bSJunchao Zhang {
348c3ff71bSJunchao Zhang   Mat_SeqAIJ       *aijseq = (Mat_SeqAIJ*)A->data;
358c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
368c3ff71bSJunchao Zhang   PetscInt         nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
378c3ff71bSJunchao Zhang 
388c3ff71bSJunchao Zhang   PetscFunctionBegin;
39a3f881fbSStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
408c3ff71bSJunchao Zhang   /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */
418c3ff71bSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
428c3ff71bSJunchao Zhang     delete aijkok;
438c3ff71bSJunchao Zhang     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
448c3ff71bSJunchao Zhang     aijkok->nonzerostate = A->nonzerostate;
458c3ff71bSJunchao Zhang     A->spptr             = aijkok;
468c3ff71bSJunchao Zhang   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
478c3ff71bSJunchao Zhang     Kokkos::deep_copy(aijkok->a_d,aijkok->a_h);
488c3ff71bSJunchao Zhang   }
498c3ff71bSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
508c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
518c3ff71bSJunchao Zhang }
528c3ff71bSJunchao Zhang 
53f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
54f0cf5187SStefano Zampini {
55f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
56f0cf5187SStefano Zampini 
57f0cf5187SStefano Zampini   PetscFunctionBegin;
58f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
59f0cf5187SStefano Zampini   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
60f0cf5187SStefano Zampini     if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
61f0cf5187SStefano Zampini     Kokkos::deep_copy(aijkok->a_h,aijkok->a_d);
62f0cf5187SStefano Zampini     A->offloadmask = PETSC_OFFLOAD_BOTH;
63f0cf5187SStefano Zampini   }
64f0cf5187SStefano Zampini   PetscFunctionReturn(0);
65f0cf5187SStefano Zampini }
66f0cf5187SStefano Zampini 
67f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
68f0cf5187SStefano Zampini {
69f0cf5187SStefano Zampini   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
70f0cf5187SStefano Zampini   PetscErrorCode ierr;
71f0cf5187SStefano Zampini 
72f0cf5187SStefano Zampini   PetscFunctionBegin;
73f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
74f0cf5187SStefano Zampini   *array = a->a;
75f0cf5187SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
76f0cf5187SStefano Zampini   PetscFunctionReturn(0);
77f0cf5187SStefano Zampini }
78f0cf5187SStefano Zampini 
79a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
80a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat)
81a587d139SMark {
82a587d139SMark   Mat_SeqAIJKokkos *aijkok;
83a587d139SMark   Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat);
84a587d139SMark 
85a587d139SMark   PetscFunctionBegin;
86a587d139SMark   // ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
87a587d139SMark   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
88a587d139SMark   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos");
89a587d139SMark   aijkok->device_mat_d = create_mirror(DeviceMemorySpace(),h_mat_k);
90a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
91a587d139SMark   PetscFunctionReturn(0);
92a587d139SMark }
93a587d139SMark 
94a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
95a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat)
96a587d139SMark {
97a587d139SMark   Mat_SeqAIJKokkos *aijkok;
98a587d139SMark 
99a587d139SMark   PetscFunctionBegin;
100a587d139SMark   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
101a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
102a587d139SMark     *d_mat = aijkok->device_mat_d.data();
103a587d139SMark   } else {
104a587d139SMark     PetscErrorCode   ierr;
105a587d139SMark     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
106a587d139SMark     *d_mat  = NULL;
107a587d139SMark   }
108a587d139SMark   PetscFunctionReturn(0);
109a587d139SMark }
110a587d139SMark 
1118c3ff71bSJunchao Zhang /* y = A x */
1128c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1138c3ff71bSJunchao Zhang {
1148c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1158c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
1168c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv;
1178c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          yv;
1188c3ff71bSJunchao Zhang 
1198c3ff71bSJunchao Zhang   PetscFunctionBegin;
1208c3ff71bSJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1218c3ff71bSJunchao Zhang   ierr   = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1228c3ff71bSJunchao Zhang   ierr   = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
1238c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1248c3ff71bSJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
1258c3ff71bSJunchao Zhang   ierr   = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1268c3ff71bSJunchao Zhang   ierr   = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
1271e566348SJunchao Zhang   /* 2.0*aijkok->csr.nnz()-aijkok->csr.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */
128bb2d6e60SJunchao Zhang   ierr   = WaitForKokkos();CHKERRQ(ierr);
1291e566348SJunchao Zhang   ierr   = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1308c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1318c3ff71bSJunchao Zhang }
1328c3ff71bSJunchao Zhang 
1338c3ff71bSJunchao Zhang /* y = A^T x */
1348c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1358c3ff71bSJunchao Zhang {
1368c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1378c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
1388c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv;
1398c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          yv;
1408c3ff71bSJunchao Zhang 
1418c3ff71bSJunchao Zhang   PetscFunctionBegin;
1428c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1438c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1448c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
1458c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1468c3ff71bSJunchao Zhang   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
1478c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1488c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
149bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
1501e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1518c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1528c3ff71bSJunchao Zhang }
1538c3ff71bSJunchao Zhang 
1548c3ff71bSJunchao Zhang /* y = A^H x */
1558c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1568c3ff71bSJunchao Zhang {
1578c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1588c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
1598c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv;
1608c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          yv;
1618c3ff71bSJunchao Zhang 
1628c3ff71bSJunchao Zhang   PetscFunctionBegin;
1638c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1648c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1658c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
1668c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1678c3ff71bSJunchao Zhang   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
1688c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1698c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
170bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
1711e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1728c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1738c3ff71bSJunchao Zhang }
1748c3ff71bSJunchao Zhang 
1758c3ff71bSJunchao Zhang /* z = A x + y */
1768c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
1778c3ff71bSJunchao Zhang {
1788c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1798c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
1808c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv,yv;
1818c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          zv;
1828c3ff71bSJunchao Zhang 
1838c3ff71bSJunchao Zhang   PetscFunctionBegin;
1848c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1858c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1868c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
1878c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
1888c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
1898c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1908c3ff71bSJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
1918c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1928c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
1938c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
194bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
1951e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1968c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1978c3ff71bSJunchao Zhang }
1988c3ff71bSJunchao Zhang 
1998c3ff71bSJunchao Zhang /* z = A^T x + y */
2008c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
2018c3ff71bSJunchao Zhang {
2028c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2038c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
2048c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv,yv;
2058c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          zv;
2068c3ff71bSJunchao Zhang 
2078c3ff71bSJunchao Zhang   PetscFunctionBegin;
2088c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
2098c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
2108c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
2118c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
2128c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
2138c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
2148c3ff71bSJunchao Zhang   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
2158c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
2168c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
2178c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
218bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
2191e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
2208c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2218c3ff71bSJunchao Zhang }
2228c3ff71bSJunchao Zhang 
2238c3ff71bSJunchao Zhang /* z = A^H x + y */
2248c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
2258c3ff71bSJunchao Zhang {
2268c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2278c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
2288c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv,yv;
2298c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          zv;
2308c3ff71bSJunchao Zhang 
2318c3ff71bSJunchao Zhang   PetscFunctionBegin;
2328c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
2338c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
2348c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
2358c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
2368c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
2378c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
2388c3ff71bSJunchao Zhang   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
2398c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
2408c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
2418c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
242bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
2431e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
2448c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2458c3ff71bSJunchao Zhang }
2468c3ff71bSJunchao Zhang 
2473d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
2488c3ff71bSJunchao Zhang {
2498c3ff71bSJunchao Zhang   PetscErrorCode ierr;
2508c3ff71bSJunchao Zhang   Mat            B;
251c58ef05eSStefano Zampini   Mat_SeqAIJ     *aij;
2528c3ff71bSJunchao Zhang 
2538c3ff71bSJunchao Zhang   PetscFunctionBegin;
254a3f881fbSStefano Zampini   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
2558c3ff71bSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
2568c3ff71bSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
2578c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
2588c3ff71bSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
2598c3ff71bSJunchao Zhang   }
2608c3ff71bSJunchao Zhang 
2618c3ff71bSJunchao Zhang   B    = *newmat;
2628c3ff71bSJunchao Zhang   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
2638c3ff71bSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
2648c3ff71bSJunchao Zhang   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
2650d8bce8aSStefano Zampini   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
266f0cf5187SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr);
267c58ef05eSStefano Zampini   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
268c58ef05eSStefano Zampini   aij  = (Mat_SeqAIJ*)B->data;
269c58ef05eSStefano Zampini   aij->inode.use = PETSC_FALSE;
2708c3ff71bSJunchao Zhang 
2718c3ff71bSJunchao Zhang   B->offloadmask = PETSC_OFFLOAD_CPU;
2728c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2738c3ff71bSJunchao Zhang }
2748c3ff71bSJunchao Zhang 
2758c3ff71bSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
2768c3ff71bSJunchao Zhang {
2778c3ff71bSJunchao Zhang   PetscErrorCode ierr;
2788c3ff71bSJunchao Zhang 
2798c3ff71bSJunchao Zhang   PetscFunctionBegin;
2808c3ff71bSJunchao Zhang   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
2818c3ff71bSJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
2828c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2838c3ff71bSJunchao Zhang }
2848c3ff71bSJunchao Zhang 
2858c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
2868c3ff71bSJunchao Zhang {
2878c3ff71bSJunchao Zhang   PetscErrorCode        ierr;
2888c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos      *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
2898c3ff71bSJunchao Zhang 
2908c3ff71bSJunchao Zhang   PetscFunctionBegin;
291a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
292a587d139SMark     delete aijkok->colmap_d;
293a587d139SMark     delete aijkok->i_uncompressed_d;
294a587d139SMark   }
2958c3ff71bSJunchao Zhang   delete aijkok;
296f0cf5187SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr);
297930e68a5SMark Adams   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
2988c3ff71bSJunchao Zhang   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
2998c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3008c3ff71bSJunchao Zhang }
3018c3ff71bSJunchao Zhang 
302a3f881fbSStefano Zampini #if 0
303a3f881fbSStefano Zampini static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data)
304a3f881fbSStefano Zampini {
305a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data);
306a3f881fbSStefano Zampini 
307a3f881fbSStefano Zampini   PetscFunctionBegin;
308a3f881fbSStefano Zampini   delete kh;
309a3f881fbSStefano Zampini   PetscFunctionReturn(0);
310a3f881fbSStefano Zampini }
311a3f881fbSStefano Zampini 
312a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
313a3f881fbSStefano Zampini {
314a3f881fbSStefano Zampini   Mat_Product          *product = C->product;
315a3f881fbSStefano Zampini   Mat                  A,B;
316a3f881fbSStefano Zampini   MatProductType       ptype;
317a3f881fbSStefano Zampini   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
318a3f881fbSStefano Zampini   bool                 tA,tB;
319a3f881fbSStefano Zampini   PetscErrorCode       ierr;
320a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh;
321a3f881fbSStefano Zampini   Mat_SeqAIJ           *c;
322a3f881fbSStefano Zampini 
323a3f881fbSStefano Zampini   PetscFunctionBegin;
324a3f881fbSStefano Zampini   MatCheckProduct(C,1);
325a3f881fbSStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
326a3f881fbSStefano Zampini   A = product->A;
327a3f881fbSStefano Zampini   B = product->B;
328a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
329a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
330a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
331a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
332a3f881fbSStefano Zampini   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
333a3f881fbSStefano Zampini   kh   = static_cast<MatMatKernelHandle_t*>(C->product->data);
334a3f881fbSStefano Zampini   ptype = product->type;
335a3f881fbSStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
336a3f881fbSStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
337a3f881fbSStefano Zampini   switch (ptype) {
338a3f881fbSStefano Zampini   case MATPRODUCT_AB:
339a3f881fbSStefano Zampini     tA = false;
340a3f881fbSStefano Zampini     tB = false;
341a3f881fbSStefano Zampini     break;
342a3f881fbSStefano Zampini   case MATPRODUCT_AtB:
343a3f881fbSStefano Zampini     tA = true;
344a3f881fbSStefano Zampini     tB = false;
345a3f881fbSStefano Zampini     break;
346a3f881fbSStefano Zampini   case MATPRODUCT_ABt:
347a3f881fbSStefano Zampini     tA = false;
348a3f881fbSStefano Zampini     tB = true;
349a3f881fbSStefano Zampini     break;
350a3f881fbSStefano Zampini   default:
351a3f881fbSStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
352a3f881fbSStefano Zampini   }
353a3f881fbSStefano Zampini 
354a3f881fbSStefano Zampini   KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr);
355a3f881fbSStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
356a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
357a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
358a3f881fbSStefano Zampini   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
359a3f881fbSStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
360a3f881fbSStefano Zampini   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
361a3f881fbSStefano Zampini   c->reallocs         = 0;
362a3f881fbSStefano Zampini   C->info.mallocs    += 0;
363a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
364a3f881fbSStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
365a3f881fbSStefano Zampini   C->num_ass++;
366a3f881fbSStefano Zampini   /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */
367a3f881fbSStefano Zampini   // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray()
368a3f881fbSStefano Zampini   C->offloadmask = PETSC_OFFLOAD_BOTH;
369a3f881fbSStefano Zampini   // Also, we should add support to copy back from device to host
370a3f881fbSStefano Zampini   PetscFunctionReturn(0);
371a3f881fbSStefano Zampini }
372a3f881fbSStefano Zampini 
373a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
374a3f881fbSStefano Zampini {
375a3f881fbSStefano Zampini   Mat_Product          *product = C->product;
376a3f881fbSStefano Zampini   Mat                  A,B;
377a3f881fbSStefano Zampini   MatProductType       ptype;
378a3f881fbSStefano Zampini   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
379a3f881fbSStefano Zampini   PetscInt             m,n,k;
380a3f881fbSStefano Zampini   bool                 tA,tB;
381a3f881fbSStefano Zampini   PetscErrorCode       ierr;
382a3f881fbSStefano Zampini   Mat_SeqAIJ           *c;
383a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh;
384a3f881fbSStefano Zampini 
385a3f881fbSStefano Zampini   PetscFunctionBegin;
386a3f881fbSStefano Zampini   MatCheckProduct(C,1);
387a3f881fbSStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
388a3f881fbSStefano Zampini   A = product->A;
389a3f881fbSStefano Zampini   B = product->B;
390a3f881fbSStefano Zampini   // TODO only copy the i,j data, not the values
391a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
392a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
393a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
394a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
395a3f881fbSStefano Zampini   ptype = product->type;
396a3f881fbSStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
397a3f881fbSStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
398a3f881fbSStefano Zampini   switch (ptype) {
399a3f881fbSStefano Zampini   case MATPRODUCT_AB:
400a3f881fbSStefano Zampini     tA = false;
401a3f881fbSStefano Zampini     tB = false;
402a3f881fbSStefano Zampini     m = A->rmap->n;
403a3f881fbSStefano Zampini     n = B->cmap->n;
404a3f881fbSStefano Zampini     break;
405a3f881fbSStefano Zampini   case MATPRODUCT_AtB:
406a3f881fbSStefano Zampini     tA = true;
407a3f881fbSStefano Zampini     tB = false;
408a3f881fbSStefano Zampini     m = A->cmap->n;
409a3f881fbSStefano Zampini     n = B->cmap->n;
410a3f881fbSStefano Zampini     break;
411a3f881fbSStefano Zampini   case MATPRODUCT_ABt:
412a3f881fbSStefano Zampini     tA = false;
413a3f881fbSStefano Zampini     tB = true;
414a3f881fbSStefano Zampini     m = A->rmap->n;
415a3f881fbSStefano Zampini     n = B->rmap->n;
416a3f881fbSStefano Zampini     break;
417a3f881fbSStefano Zampini   default:
418a3f881fbSStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
419a3f881fbSStefano Zampini   }
420a3f881fbSStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
421a3f881fbSStefano Zampini   ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr);
422a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
423a3f881fbSStefano Zampini 
424a3f881fbSStefano Zampini   kh = new MatMatKernelHandle_t;
425a3f881fbSStefano Zampini   // TODO SZ: ADD RUNTIME SELECTION OF THESE
426a3f881fbSStefano Zampini   kh->set_team_work_size(16);
427a3f881fbSStefano Zampini   kh->set_dynamic_scheduling(true);
428a3f881fbSStefano Zampini   // Select an spgemm algorithm, limited by configuration at compile-time and
429a3f881fbSStefano Zampini   // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED,
430a3f881fbSStefano Zampini   // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL}
431a3f881fbSStefano Zampini   std::string myalg("SPGEMM_KK_MEMORY");
432a3f881fbSStefano Zampini   kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg));
433a3f881fbSStefano Zampini 
434a3f881fbSStefano Zampini   /////////////////////////////////////
435a3f881fbSStefano Zampini   // TODO JZ
436a3f881fbSStefano Zampini   ckok = NULL; //new Mat_SeqAIJKokkos();
437a3f881fbSStefano Zampini   C->spptr = ckok;
438a3f881fbSStefano Zampini   KokkosCsrMatrix_t ccsr; // here only to have the code compile
439a3f881fbSStefano Zampini   KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr);
440a3f881fbSStefano Zampini   //cerr = WaitForKOKKOS();CHKERRCUDA(cerr);
441a3f881fbSStefano Zampini   //c->nz = get_nnz_from_ccsr
442a3f881fbSStefano Zampini   //////////////////////////////////////
443a3f881fbSStefano Zampini   c->singlemalloc = PETSC_FALSE;
444a3f881fbSStefano Zampini   c->free_a       = PETSC_TRUE;
445a3f881fbSStefano Zampini   c->free_ij      = PETSC_TRUE;
446a3f881fbSStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
447a3f881fbSStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
448a3f881fbSStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
449a3f881fbSStefano Zampini   ////////////////////////////////////
450a3f881fbSStefano Zampini   // TODO JZ copy from device to c->i and c->j
451a3f881fbSStefano Zampini   ////////////////////////////////////
452a3f881fbSStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
453a3f881fbSStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
454a3f881fbSStefano Zampini   c->maxnz = c->nz;
455a3f881fbSStefano Zampini   c->nonzerorowcnt = 0;
456a3f881fbSStefano Zampini   c->rmax = 0;
457a3f881fbSStefano Zampini   for (k = 0; k < m; k++) {
458a3f881fbSStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
459a3f881fbSStefano Zampini     c->ilen[k] = c->imax[k] = nn;
460a3f881fbSStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
461a3f881fbSStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
462a3f881fbSStefano Zampini   }
463a3f881fbSStefano Zampini 
464a3f881fbSStefano Zampini   C->nonzerostate++;
465a3f881fbSStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
466a3f881fbSStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
467a3f881fbSStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
468a3f881fbSStefano Zampini   ckok->nonzerostate = C->nonzerostate;
469a3f881fbSStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
470a3f881fbSStefano Zampini   C->preallocated  = PETSC_TRUE;
471a3f881fbSStefano Zampini   C->assembled     = PETSC_FALSE;
472a3f881fbSStefano Zampini   C->was_assembled = PETSC_FALSE;
473a3f881fbSStefano Zampini 
474a3f881fbSStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
475a3f881fbSStefano Zampini   C->product->data = kh;
476a3f881fbSStefano Zampini   C->product->destroy = MatMatKernelHandleDestroy_Private;
477a3f881fbSStefano Zampini   PetscFunctionReturn(0);
478a3f881fbSStefano Zampini }
479a3f881fbSStefano Zampini 
480a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
481a3f881fbSStefano Zampini PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
482a3f881fbSStefano Zampini {
483a3f881fbSStefano Zampini   Mat_Product    *product = mat->product;
484a3f881fbSStefano Zampini   PetscErrorCode ierr;
485a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
486a3f881fbSStefano Zampini 
487a3f881fbSStefano Zampini   PetscFunctionBegin;
488a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
489a3f881fbSStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
490a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
491a3f881fbSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
492a3f881fbSStefano Zampini   }
493a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
494a3f881fbSStefano Zampini     switch (product->type) {
495a3f881fbSStefano Zampini     case MATPRODUCT_AB:
496a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
497a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
498a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
499a3f881fbSStefano Zampini       break;
500a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
501a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
502a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
503a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
504a3f881fbSStefano Zampini       break;
505a3f881fbSStefano Zampini     default:
506a3f881fbSStefano Zampini       break;
507a3f881fbSStefano Zampini     }
508a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
509a3f881fbSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
510a3f881fbSStefano Zampini   }
511a3f881fbSStefano Zampini   PetscFunctionReturn(0);
512a3f881fbSStefano Zampini }
513a3f881fbSStefano Zampini #endif
514a3f881fbSStefano Zampini 
5158c3ff71bSJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
5168c3ff71bSJunchao Zhang {
5178c3ff71bSJunchao Zhang   PetscErrorCode ierr;
5188c3ff71bSJunchao Zhang 
5198c3ff71bSJunchao Zhang   PetscFunctionBegin;
5208c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
5218c3ff71bSJunchao Zhang   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
5228c3ff71bSJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
5238c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5248c3ff71bSJunchao Zhang }
5258c3ff71bSJunchao Zhang 
526a587d139SMark static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
527a587d139SMark {
528a587d139SMark   PetscErrorCode    ierr;
529a587d139SMark   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
530a587d139SMark   PetscFunctionBegin;
531a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
532a587d139SMark     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported");
533a587d139SMark   }
534a587d139SMark   ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr);
535a587d139SMark   PetscFunctionReturn(0);
536a587d139SMark }
537a587d139SMark 
538f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
539f0cf5187SStefano Zampini {
540f0cf5187SStefano Zampini   PetscErrorCode   ierr;
541f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
542f0cf5187SStefano Zampini 
543f0cf5187SStefano Zampini   PetscFunctionBegin;
544f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
545f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
546f0cf5187SStefano Zampini   KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d);
547f0cf5187SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
548f0cf5187SStefano Zampini   ierr = WaitForKokkos();CHKERRQ(ierr);
549f0cf5187SStefano Zampini   ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr);
550f0cf5187SStefano Zampini   // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
551f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
552f0cf5187SStefano Zampini   PetscFunctionReturn(0);
553f0cf5187SStefano Zampini }
554f0cf5187SStefano Zampini 
555a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
556a587d139SMark {
557a587d139SMark   PetscErrorCode   ierr;
558a587d139SMark   PetscBool        both = PETSC_FALSE;
559a587d139SMark   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
560a587d139SMark   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
561a587d139SMark 
562a587d139SMark   PetscFunctionBegin;
563a587d139SMark   if (aijkok && aijkok->a_d.data()) {
564f0cf5187SStefano Zampini     KokkosBlas::fill(aijkok->a_d,0.0);
565a587d139SMark     both = PETSC_TRUE;
566a587d139SMark   }
567a587d139SMark   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
568a587d139SMark   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
569a587d139SMark   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
570a587d139SMark   else A->offloadmask = PETSC_OFFLOAD_CPU;
571a587d139SMark   PetscFunctionReturn(0);
572a587d139SMark }
573a587d139SMark 
574f0cf5187SStefano Zampini static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str)
575a587d139SMark {
576a587d139SMark   PetscErrorCode ierr;
577a587d139SMark   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
578a587d139SMark 
579a587d139SMark   PetscFunctionBegin;
580f0cf5187SStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
581a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
582a587d139SMark     PetscFunctionReturn(0);
583a587d139SMark   }
584f0cf5187SStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) {
585a587d139SMark     PetscBool e;
586a587d139SMark     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
587a587d139SMark     if (e) {
588a587d139SMark       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
589f0cf5187SStefano Zampini       if (e) str = SAME_NONZERO_PATTERN;
590a587d139SMark     }
591a587d139SMark   }
592a587d139SMark   if (str != SAME_NONZERO_PATTERN) {
593a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
594a587d139SMark     PetscFunctionReturn(0);
595a587d139SMark   } else {
596a587d139SMark     if (Y->offloadmask == PETSC_OFFLOAD_CPU && X->offloadmask == PETSC_OFFLOAD_CPU) {
597a587d139SMark       ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
598a587d139SMark       PetscFunctionReturn(0);
599f0cf5187SStefano Zampini     } else {
600a587d139SMark       ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
601f0cf5187SStefano Zampini       ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
602a587d139SMark     }
603a587d139SMark     Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
604a587d139SMark     Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
605a587d139SMark     if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) {
606f0cf5187SStefano Zampini       KokkosBlas::axpy(a,aijkokX->a_d,aijkokY->a_d);
607f0cf5187SStefano Zampini       Y->offloadmask = PETSC_OFFLOAD_GPU;
608f0cf5187SStefano Zampini       ierr = WaitForKokkos();CHKERRQ(ierr);
609a587d139SMark       ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr);
610f0cf5187SStefano Zampini       // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
611f0cf5187SStefano Zampini       ierr = MatSeqAIJKokkosSyncHost(Y);CHKERRQ(ierr);
612a587d139SMark     } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???");
613a587d139SMark   }
614a587d139SMark   PetscFunctionReturn(0);
615a587d139SMark }
616a587d139SMark 
6178c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
6188c3ff71bSJunchao Zhang {
6198c3ff71bSJunchao Zhang   PetscFunctionBegin;
620*6f3d89d0SStefano Zampini   A->boundtocpu = PETSC_FALSE;
621*6f3d89d0SStefano Zampini 
622a587d139SMark   A->ops->setvalues                 = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */
6238c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
6248c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
6258c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
626a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
627f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
628a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
629a3f881fbSStefano Zampini   //A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
6308c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
6318c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
6328c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
6338c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
6348c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
6358c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
6368c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
6378c3ff71bSJunchao Zhang }
6388c3ff71bSJunchao Zhang 
6398c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
6408c3ff71bSJunchao Zhang /*C
6418c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
6428c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
6438c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
6448c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
6458c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
6468c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
6478c3ff71bSJunchao Zhang 
6488c3ff71bSJunchao Zhang    Collective
6498c3ff71bSJunchao Zhang 
6508c3ff71bSJunchao Zhang    Input Parameters:
6518c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
6528c3ff71bSJunchao Zhang .  m - number of rows
6538c3ff71bSJunchao Zhang .  n - number of columns
6548c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
6558c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
6568c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
6578c3ff71bSJunchao Zhang 
6588c3ff71bSJunchao Zhang    Output Parameter:
6598c3ff71bSJunchao Zhang .  A - the matrix
6608c3ff71bSJunchao Zhang 
6618c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
6628c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
6638c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
6648c3ff71bSJunchao Zhang 
6658c3ff71bSJunchao Zhang    Notes:
6668c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
6678c3ff71bSJunchao Zhang 
6688c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
6698c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
6708c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
6718c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
6728c3ff71bSJunchao Zhang 
6738c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
6748c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
6758c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
6768c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
6778c3ff71bSJunchao Zhang 
6788c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
6798c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
6808c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
6818c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
6828c3ff71bSJunchao Zhang 
6838c3ff71bSJunchao Zhang    Level: intermediate
6848c3ff71bSJunchao Zhang 
6858c3ff71bSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
6868c3ff71bSJunchao Zhang @*/
6878c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
6888c3ff71bSJunchao Zhang {
6898c3ff71bSJunchao Zhang   PetscErrorCode ierr;
6908c3ff71bSJunchao Zhang 
6918c3ff71bSJunchao Zhang   PetscFunctionBegin;
6928c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
6938c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
6948c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
6958c3ff71bSJunchao Zhang   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
6968c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
6978c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
6988c3ff71bSJunchao Zhang }
699930e68a5SMark Adams 
700930e68a5SMark Adams // factorizations
701930e68a5SMark Adams 
702930e68a5SMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOS(Mat B,Mat A,const MatFactorInfo *info)
703930e68a5SMark Adams {
704930e68a5SMark Adams   // Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
705930e68a5SMark Adams   // IS             isrow = b->row,iscol = b->col;
706930e68a5SMark Adams   // PetscBool      row_identity,col_identity;
707930e68a5SMark Adams   PetscErrorCode ierr;
708930e68a5SMark Adams 
709930e68a5SMark Adams   PetscFunctionBegin;
710930e68a5SMark Adams   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
711930e68a5SMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
712930e68a5SMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
713930e68a5SMark Adams   // solves stay on CPU now
714930e68a5SMark Adams   /* determine which version of MatSolve needs to be used. */
715930e68a5SMark Adams   // ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
716930e68a5SMark Adams   // ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
717930e68a5SMark Adams   // if (row_identity && col_identity) {
718930e68a5SMark Adams   //   B->ops->solve = MatSolve_SeqAIJKOKKOS_NaturalOrdering;
719930e68a5SMark Adams   //   B->ops->solvetranspose = MatSolveTranspose_SeqAIJKOKKOS_NaturalOrdering;
720930e68a5SMark Adams   //   B->ops->matsolve = NULL;
721930e68a5SMark Adams   //   B->ops->matsolvetranspose = NULL;
722930e68a5SMark Adams   // } else {
723930e68a5SMark Adams   //   B->ops->solve = MatSolve_SeqAIJKOKKOS;
724930e68a5SMark Adams   //   B->ops->solvetranspose = MatSolveTranspose_SeqAIJKOKKOS;
725930e68a5SMark Adams   //   B->ops->matsolve = NULL;
726930e68a5SMark Adams   //   B->ops->matsolvetranspose = NULL;
727930e68a5SMark Adams   // }
728930e68a5SMark Adams 
729930e68a5SMark Adams   /* get the triangular factors */
730930e68a5SMark Adams   // ierr = MatSeqAIJKOKKOSILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
731930e68a5SMark Adams   PetscFunctionReturn(0);
732930e68a5SMark Adams }
733930e68a5SMark Adams 
734930e68a5SMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOS(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
735930e68a5SMark Adams {
736930e68a5SMark Adams   PetscErrorCode               ierr;
737930e68a5SMark Adams 
738930e68a5SMark Adams   PetscFunctionBegin;
739930e68a5SMark Adams   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
740930e68a5SMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOS;
741930e68a5SMark Adams   PetscFunctionReturn(0);
742930e68a5SMark Adams }
743930e68a5SMark Adams 
744930e68a5SMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat,MatFactorType,Mat*);
745930e68a5SMark Adams 
746930e68a5SMark Adams 
747930e68a5SMark Adams PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
748930e68a5SMark Adams {
749930e68a5SMark Adams   PetscErrorCode ierr;
750930e68a5SMark Adams 
751930e68a5SMark Adams   PetscFunctionBegin;
752930e68a5SMark Adams   ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos);CHKERRQ(ierr);
753930e68a5SMark Adams   PetscFunctionReturn(0);
754930e68a5SMark Adams }
755930e68a5SMark Adams 
756930e68a5SMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos(Mat A,MatSolverType *type)
757930e68a5SMark Adams {
758930e68a5SMark Adams   PetscFunctionBegin;
759930e68a5SMark Adams   *type = MATSOLVERKOKKOS;
760930e68a5SMark Adams   PetscFunctionReturn(0);
761930e68a5SMark Adams }
762930e68a5SMark Adams 
763930e68a5SMark Adams /*MC
764930e68a5SMark Adams   MATSOLVERKOKKOS = "kokkos" - A matrix solver type providing triangular solvers for sequential matrices
765930e68a5SMark Adams   on a single GPU of type, seqaijkokkos, aijkokkos.
766930e68a5SMark Adams 
767930e68a5SMark Adams   Level: beginner
768930e68a5SMark Adams 
769930e68a5SMark Adams .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKOKKOS(), MATAIJKOKKOS, MatCreateAIJKOKKOS(), MatKOKKOSSetFormat(), MatKOKKOSStorageFormat, MatKOKKOSFormatOperation
770930e68a5SMark Adams M*/
771930e68a5SMark Adams 
772930e68a5SMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos(Mat A,MatFactorType ftype,Mat *B)
773930e68a5SMark Adams {
774930e68a5SMark Adams   PetscErrorCode ierr;
775930e68a5SMark Adams   PetscInt       n = A->rmap->n;
776930e68a5SMark Adams 
777930e68a5SMark Adams   PetscFunctionBegin;
778930e68a5SMark Adams   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
779930e68a5SMark Adams   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
780930e68a5SMark Adams   (*B)->factortype = ftype;
781930e68a5SMark Adams   (*B)->useordering = PETSC_TRUE;
782930e68a5SMark Adams   ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
783930e68a5SMark Adams 
784930e68a5SMark Adams   if (ftype == MAT_FACTOR_LU /* || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT*/) {
785930e68a5SMark Adams     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
786930e68a5SMark Adams     // (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKOKKOS;
787930e68a5SMark Adams     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKOKKOS;
788930e68a5SMark Adams     // } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
789930e68a5SMark Adams     // (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKOKKOS;
790930e68a5SMark Adams     // (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKOKKOS;
791930e68a5SMark Adams   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");
792930e68a5SMark Adams 
793930e68a5SMark Adams   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
794930e68a5SMark Adams   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos);CHKERRQ(ierr);
795930e68a5SMark Adams   PetscFunctionReturn(0);
796930e68a5SMark Adams }
797