xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision f0cf51870644c48937fa5d201e39613e257305c9)
18c3ff71bSJunchao Zhang #include "petsc/private/petscimpl.h"
28c3ff71bSJunchao Zhang #include <petscsystypes.h>
38c3ff71bSJunchao Zhang #include <petscerror.h>
49f114eb2SJunchao Zhang #include <petscveckokkos.hpp>
58c3ff71bSJunchao Zhang 
68c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
7*f0cf5187SStefano Zampini #include <KokkosBlas.hpp>
88c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
98c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
108c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
118c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h>
128c3ff71bSJunchao Zhang 
138c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
14a587d139SMark #include <petscmat.h>
158c3ff71bSJunchao Zhang 
168c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
178c3ff71bSJunchao Zhang 
188c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
198c3ff71bSJunchao Zhang {
208c3ff71bSJunchao Zhang   PetscErrorCode    ierr;
21a587d139SMark   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
228c3ff71bSJunchao Zhang 
238c3ff71bSJunchao Zhang   PetscFunctionBegin;
248c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
25a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
26a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
27a587d139SMark   }
288c3ff71bSJunchao Zhang   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
298c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
308c3ff71bSJunchao Zhang }
318c3ff71bSJunchao Zhang 
328c3ff71bSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
338c3ff71bSJunchao Zhang {
348c3ff71bSJunchao Zhang   Mat_SeqAIJ       *aijseq = (Mat_SeqAIJ*)A->data;
358c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
368c3ff71bSJunchao Zhang   PetscInt         nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
378c3ff71bSJunchao Zhang 
388c3ff71bSJunchao Zhang   PetscFunctionBegin;
39a3f881fbSStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
408c3ff71bSJunchao Zhang   /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */
418c3ff71bSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
428c3ff71bSJunchao Zhang     delete aijkok;
438c3ff71bSJunchao Zhang     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
448c3ff71bSJunchao Zhang     aijkok->nonzerostate = A->nonzerostate;
458c3ff71bSJunchao Zhang     A->spptr             = aijkok;
468c3ff71bSJunchao Zhang   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
478c3ff71bSJunchao Zhang     Kokkos::deep_copy(aijkok->a_d,aijkok->a_h);
488c3ff71bSJunchao Zhang   }
498c3ff71bSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
508c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
518c3ff71bSJunchao Zhang }
528c3ff71bSJunchao Zhang 
53*f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
54*f0cf5187SStefano Zampini {
55*f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
56*f0cf5187SStefano Zampini 
57*f0cf5187SStefano Zampini   PetscFunctionBegin;
58*f0cf5187SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
59*f0cf5187SStefano Zampini   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
60*f0cf5187SStefano Zampini     if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK");
61*f0cf5187SStefano Zampini     Kokkos::deep_copy(aijkok->a_h,aijkok->a_d);
62*f0cf5187SStefano Zampini     A->offloadmask = PETSC_OFFLOAD_BOTH;
63*f0cf5187SStefano Zampini   }
64*f0cf5187SStefano Zampini   PetscFunctionReturn(0);
65*f0cf5187SStefano Zampini }
66*f0cf5187SStefano Zampini 
67*f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
68*f0cf5187SStefano Zampini {
69*f0cf5187SStefano Zampini   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
70*f0cf5187SStefano Zampini   PetscErrorCode ierr;
71*f0cf5187SStefano Zampini 
72*f0cf5187SStefano Zampini   PetscFunctionBegin;
73*f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
74*f0cf5187SStefano Zampini   *array = a->a;
75*f0cf5187SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
76*f0cf5187SStefano Zampini   PetscFunctionReturn(0);
77*f0cf5187SStefano Zampini }
78*f0cf5187SStefano Zampini 
79a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
80a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat)
81a587d139SMark {
82a587d139SMark   Mat_SeqAIJKokkos *aijkok;
83a587d139SMark   Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat);
84a587d139SMark 
85a587d139SMark   PetscFunctionBegin;
86a587d139SMark   // ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
87a587d139SMark   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
88a587d139SMark   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos");
89a587d139SMark   aijkok->device_mat_d = create_mirror(DeviceMemorySpace(),h_mat_k);
90a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
91a587d139SMark   PetscFunctionReturn(0);
92a587d139SMark }
93a587d139SMark 
94a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
95a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat)
96a587d139SMark {
97a587d139SMark   Mat_SeqAIJKokkos *aijkok;
98a587d139SMark 
99a587d139SMark   PetscFunctionBegin;
100a587d139SMark   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
101a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
102a587d139SMark     *d_mat = aijkok->device_mat_d.data();
103a587d139SMark   } else {
104a587d139SMark     PetscErrorCode   ierr;
105a587d139SMark     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
106a587d139SMark     *d_mat  = NULL;
107a587d139SMark   }
108a587d139SMark   PetscFunctionReturn(0);
109a587d139SMark }
110a587d139SMark 
1118c3ff71bSJunchao Zhang /* y = A x */
1128c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1138c3ff71bSJunchao Zhang {
1148c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1158c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
1168c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv;
1178c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          yv;
1188c3ff71bSJunchao Zhang 
1198c3ff71bSJunchao Zhang   PetscFunctionBegin;
1208c3ff71bSJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1218c3ff71bSJunchao Zhang   ierr   = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1228c3ff71bSJunchao Zhang   ierr   = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
1238c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1248c3ff71bSJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
1258c3ff71bSJunchao Zhang   ierr   = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1268c3ff71bSJunchao Zhang   ierr   = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
1271e566348SJunchao Zhang   /* 2.0*aijkok->csr.nnz()-aijkok->csr.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */
128bb2d6e60SJunchao Zhang   ierr   = WaitForKokkos();CHKERRQ(ierr);
1291e566348SJunchao Zhang   ierr   = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1308c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1318c3ff71bSJunchao Zhang }
1328c3ff71bSJunchao Zhang 
1338c3ff71bSJunchao Zhang /* y = A^T x */
1348c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1358c3ff71bSJunchao Zhang {
1368c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1378c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
1388c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv;
1398c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          yv;
1408c3ff71bSJunchao Zhang 
1418c3ff71bSJunchao Zhang   PetscFunctionBegin;
1428c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1438c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1448c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
1458c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1468c3ff71bSJunchao Zhang   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
1478c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1488c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
149bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
1501e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1518c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1528c3ff71bSJunchao Zhang }
1538c3ff71bSJunchao Zhang 
1548c3ff71bSJunchao Zhang /* y = A^H x */
1558c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1568c3ff71bSJunchao Zhang {
1578c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1588c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
1598c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv;
1608c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          yv;
1618c3ff71bSJunchao Zhang 
1628c3ff71bSJunchao Zhang   PetscFunctionBegin;
1638c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1648c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1658c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
1668c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1678c3ff71bSJunchao Zhang   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
1688c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1698c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
170bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
1711e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1728c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1738c3ff71bSJunchao Zhang }
1748c3ff71bSJunchao Zhang 
1758c3ff71bSJunchao Zhang /* z = A x + y */
1768c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
1778c3ff71bSJunchao Zhang {
1788c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1798c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
1808c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv,yv;
1818c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          zv;
1828c3ff71bSJunchao Zhang 
1838c3ff71bSJunchao Zhang   PetscFunctionBegin;
1848c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1858c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1868c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
1878c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
1888c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
1898c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1908c3ff71bSJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
1918c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1928c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
1938c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
194bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
1951e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1968c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1978c3ff71bSJunchao Zhang }
1988c3ff71bSJunchao Zhang 
1998c3ff71bSJunchao Zhang /* z = A^T x + y */
2008c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
2018c3ff71bSJunchao Zhang {
2028c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2038c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
2048c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv,yv;
2058c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          zv;
2068c3ff71bSJunchao Zhang 
2078c3ff71bSJunchao Zhang   PetscFunctionBegin;
2088c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
2098c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
2108c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
2118c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
2128c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
2138c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
2148c3ff71bSJunchao Zhang   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
2158c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
2168c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
2178c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
218bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
2191e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
2208c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2218c3ff71bSJunchao Zhang }
2228c3ff71bSJunchao Zhang 
2238c3ff71bSJunchao Zhang /* z = A^H x + y */
2248c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
2258c3ff71bSJunchao Zhang {
2268c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2278c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
2288c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv,yv;
2298c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          zv;
2308c3ff71bSJunchao Zhang 
2318c3ff71bSJunchao Zhang   PetscFunctionBegin;
2328c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
2338c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
2348c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
2358c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
2368c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
2378c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
2388c3ff71bSJunchao Zhang   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
2398c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
2408c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
2418c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
242bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
2431e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
2448c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2458c3ff71bSJunchao Zhang }
2468c3ff71bSJunchao Zhang 
2473d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
2488c3ff71bSJunchao Zhang {
2498c3ff71bSJunchao Zhang   PetscErrorCode ierr;
2508c3ff71bSJunchao Zhang   Mat            B;
251c58ef05eSStefano Zampini   Mat_SeqAIJ     *aij;
2528c3ff71bSJunchao Zhang 
2538c3ff71bSJunchao Zhang   PetscFunctionBegin;
254a3f881fbSStefano Zampini   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
2558c3ff71bSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
2568c3ff71bSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
2578c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
2588c3ff71bSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
2598c3ff71bSJunchao Zhang   }
2608c3ff71bSJunchao Zhang 
2618c3ff71bSJunchao Zhang   B    = *newmat;
2628c3ff71bSJunchao Zhang   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
2638c3ff71bSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
2648c3ff71bSJunchao Zhang   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
2650d8bce8aSStefano Zampini   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
266*f0cf5187SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJKokkos);CHKERRQ(ierr);
267c58ef05eSStefano Zampini   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
268c58ef05eSStefano Zampini   aij  = (Mat_SeqAIJ*)B->data;
269c58ef05eSStefano Zampini   aij->inode.use = PETSC_FALSE;
2708c3ff71bSJunchao Zhang 
2718c3ff71bSJunchao Zhang   B->offloadmask = PETSC_OFFLOAD_CPU;
2728c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2738c3ff71bSJunchao Zhang }
2748c3ff71bSJunchao Zhang 
2758c3ff71bSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
2768c3ff71bSJunchao Zhang {
2778c3ff71bSJunchao Zhang   PetscErrorCode ierr;
2788c3ff71bSJunchao Zhang 
2798c3ff71bSJunchao Zhang   PetscFunctionBegin;
2808c3ff71bSJunchao Zhang   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
2818c3ff71bSJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
2828c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2838c3ff71bSJunchao Zhang }
2848c3ff71bSJunchao Zhang 
2858c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
2868c3ff71bSJunchao Zhang {
2878c3ff71bSJunchao Zhang   PetscErrorCode        ierr;
2888c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos      *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
2898c3ff71bSJunchao Zhang 
2908c3ff71bSJunchao Zhang   PetscFunctionBegin;
291a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
292a587d139SMark     delete aijkok->colmap_d;
293a587d139SMark     delete aijkok->i_uncompressed_d;
294a587d139SMark   }
2958c3ff71bSJunchao Zhang   delete aijkok;
296*f0cf5187SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr);
2978c3ff71bSJunchao Zhang   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
2988c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2998c3ff71bSJunchao Zhang }
3008c3ff71bSJunchao Zhang 
301a3f881fbSStefano Zampini #if 0
302a3f881fbSStefano Zampini static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data)
303a3f881fbSStefano Zampini {
304a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data);
305a3f881fbSStefano Zampini 
306a3f881fbSStefano Zampini   PetscFunctionBegin;
307a3f881fbSStefano Zampini   delete kh;
308a3f881fbSStefano Zampini   PetscFunctionReturn(0);
309a3f881fbSStefano Zampini }
310a3f881fbSStefano Zampini 
311a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
312a3f881fbSStefano Zampini {
313a3f881fbSStefano Zampini   Mat_Product          *product = C->product;
314a3f881fbSStefano Zampini   Mat                  A,B;
315a3f881fbSStefano Zampini   MatProductType       ptype;
316a3f881fbSStefano Zampini   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
317a3f881fbSStefano Zampini   bool                 tA,tB;
318a3f881fbSStefano Zampini   PetscErrorCode       ierr;
319a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh;
320a3f881fbSStefano Zampini   Mat_SeqAIJ           *c;
321a3f881fbSStefano Zampini 
322a3f881fbSStefano Zampini   PetscFunctionBegin;
323a3f881fbSStefano Zampini   MatCheckProduct(C,1);
324a3f881fbSStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
325a3f881fbSStefano Zampini   A = product->A;
326a3f881fbSStefano Zampini   B = product->B;
327a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
328a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
329a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
330a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
331a3f881fbSStefano Zampini   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
332a3f881fbSStefano Zampini   kh   = static_cast<MatMatKernelHandle_t*>(C->product->data);
333a3f881fbSStefano Zampini   ptype = product->type;
334a3f881fbSStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
335a3f881fbSStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
336a3f881fbSStefano Zampini   switch (ptype) {
337a3f881fbSStefano Zampini   case MATPRODUCT_AB:
338a3f881fbSStefano Zampini     tA = false;
339a3f881fbSStefano Zampini     tB = false;
340a3f881fbSStefano Zampini     break;
341a3f881fbSStefano Zampini   case MATPRODUCT_AtB:
342a3f881fbSStefano Zampini     tA = true;
343a3f881fbSStefano Zampini     tB = false;
344a3f881fbSStefano Zampini     break;
345a3f881fbSStefano Zampini   case MATPRODUCT_ABt:
346a3f881fbSStefano Zampini     tA = false;
347a3f881fbSStefano Zampini     tB = true;
348a3f881fbSStefano Zampini     break;
349a3f881fbSStefano Zampini   default:
350a3f881fbSStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
351a3f881fbSStefano Zampini   }
352a3f881fbSStefano Zampini 
353a3f881fbSStefano Zampini   KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr);
354a3f881fbSStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
355a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
356a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
357a3f881fbSStefano Zampini   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
358a3f881fbSStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
359a3f881fbSStefano Zampini   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
360a3f881fbSStefano Zampini   c->reallocs         = 0;
361a3f881fbSStefano Zampini   C->info.mallocs    += 0;
362a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
363a3f881fbSStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
364a3f881fbSStefano Zampini   C->num_ass++;
365a3f881fbSStefano Zampini   /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */
366a3f881fbSStefano Zampini   // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray()
367a3f881fbSStefano Zampini   C->offloadmask = PETSC_OFFLOAD_BOTH;
368a3f881fbSStefano Zampini   // Also, we should add support to copy back from device to host
369a3f881fbSStefano Zampini   PetscFunctionReturn(0);
370a3f881fbSStefano Zampini }
371a3f881fbSStefano Zampini 
372a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
373a3f881fbSStefano Zampini {
374a3f881fbSStefano Zampini   Mat_Product          *product = C->product;
375a3f881fbSStefano Zampini   Mat                  A,B;
376a3f881fbSStefano Zampini   MatProductType       ptype;
377a3f881fbSStefano Zampini   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
378a3f881fbSStefano Zampini   PetscInt             m,n,k;
379a3f881fbSStefano Zampini   bool                 tA,tB;
380a3f881fbSStefano Zampini   PetscErrorCode       ierr;
381a3f881fbSStefano Zampini   Mat_SeqAIJ           *c;
382a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh;
383a3f881fbSStefano Zampini 
384a3f881fbSStefano Zampini   PetscFunctionBegin;
385a3f881fbSStefano Zampini   MatCheckProduct(C,1);
386a3f881fbSStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
387a3f881fbSStefano Zampini   A = product->A;
388a3f881fbSStefano Zampini   B = product->B;
389a3f881fbSStefano Zampini   // TODO only copy the i,j data, not the values
390a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
391a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
392a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
393a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
394a3f881fbSStefano Zampini   ptype = product->type;
395a3f881fbSStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
396a3f881fbSStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
397a3f881fbSStefano Zampini   switch (ptype) {
398a3f881fbSStefano Zampini   case MATPRODUCT_AB:
399a3f881fbSStefano Zampini     tA = false;
400a3f881fbSStefano Zampini     tB = false;
401a3f881fbSStefano Zampini     m = A->rmap->n;
402a3f881fbSStefano Zampini     n = B->cmap->n;
403a3f881fbSStefano Zampini     break;
404a3f881fbSStefano Zampini   case MATPRODUCT_AtB:
405a3f881fbSStefano Zampini     tA = true;
406a3f881fbSStefano Zampini     tB = false;
407a3f881fbSStefano Zampini     m = A->cmap->n;
408a3f881fbSStefano Zampini     n = B->cmap->n;
409a3f881fbSStefano Zampini     break;
410a3f881fbSStefano Zampini   case MATPRODUCT_ABt:
411a3f881fbSStefano Zampini     tA = false;
412a3f881fbSStefano Zampini     tB = true;
413a3f881fbSStefano Zampini     m = A->rmap->n;
414a3f881fbSStefano Zampini     n = B->rmap->n;
415a3f881fbSStefano Zampini     break;
416a3f881fbSStefano Zampini   default:
417a3f881fbSStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
418a3f881fbSStefano Zampini   }
419a3f881fbSStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
420a3f881fbSStefano Zampini   ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr);
421a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
422a3f881fbSStefano Zampini 
423a3f881fbSStefano Zampini   kh = new MatMatKernelHandle_t;
424a3f881fbSStefano Zampini   // TODO SZ: ADD RUNTIME SELECTION OF THESE
425a3f881fbSStefano Zampini   kh->set_team_work_size(16);
426a3f881fbSStefano Zampini   kh->set_dynamic_scheduling(true);
427a3f881fbSStefano Zampini   // Select an spgemm algorithm, limited by configuration at compile-time and
428a3f881fbSStefano Zampini   // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED,
429a3f881fbSStefano Zampini   // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL}
430a3f881fbSStefano Zampini   std::string myalg("SPGEMM_KK_MEMORY");
431a3f881fbSStefano Zampini   kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg));
432a3f881fbSStefano Zampini 
433a3f881fbSStefano Zampini   /////////////////////////////////////
434a3f881fbSStefano Zampini   // TODO JZ
435a3f881fbSStefano Zampini   ckok = NULL; //new Mat_SeqAIJKokkos();
436a3f881fbSStefano Zampini   C->spptr = ckok;
437a3f881fbSStefano Zampini   KokkosCsrMatrix_t ccsr; // here only to have the code compile
438a3f881fbSStefano Zampini   KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr);
439a3f881fbSStefano Zampini   //cerr = WaitForKOKKOS();CHKERRCUDA(cerr);
440a3f881fbSStefano Zampini   //c->nz = get_nnz_from_ccsr
441a3f881fbSStefano Zampini   //////////////////////////////////////
442a3f881fbSStefano Zampini   c->singlemalloc = PETSC_FALSE;
443a3f881fbSStefano Zampini   c->free_a       = PETSC_TRUE;
444a3f881fbSStefano Zampini   c->free_ij      = PETSC_TRUE;
445a3f881fbSStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
446a3f881fbSStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
447a3f881fbSStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
448a3f881fbSStefano Zampini   ////////////////////////////////////
449a3f881fbSStefano Zampini   // TODO JZ copy from device to c->i and c->j
450a3f881fbSStefano Zampini   ////////////////////////////////////
451a3f881fbSStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
452a3f881fbSStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
453a3f881fbSStefano Zampini   c->maxnz = c->nz;
454a3f881fbSStefano Zampini   c->nonzerorowcnt = 0;
455a3f881fbSStefano Zampini   c->rmax = 0;
456a3f881fbSStefano Zampini   for (k = 0; k < m; k++) {
457a3f881fbSStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
458a3f881fbSStefano Zampini     c->ilen[k] = c->imax[k] = nn;
459a3f881fbSStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
460a3f881fbSStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
461a3f881fbSStefano Zampini   }
462a3f881fbSStefano Zampini 
463a3f881fbSStefano Zampini   C->nonzerostate++;
464a3f881fbSStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
465a3f881fbSStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
466a3f881fbSStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
467a3f881fbSStefano Zampini   ckok->nonzerostate = C->nonzerostate;
468a3f881fbSStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
469a3f881fbSStefano Zampini   C->preallocated  = PETSC_TRUE;
470a3f881fbSStefano Zampini   C->assembled     = PETSC_FALSE;
471a3f881fbSStefano Zampini   C->was_assembled = PETSC_FALSE;
472a3f881fbSStefano Zampini 
473a3f881fbSStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
474a3f881fbSStefano Zampini   C->product->data = kh;
475a3f881fbSStefano Zampini   C->product->destroy = MatMatKernelHandleDestroy_Private;
476a3f881fbSStefano Zampini   PetscFunctionReturn(0);
477a3f881fbSStefano Zampini }
478a3f881fbSStefano Zampini 
479a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
480a3f881fbSStefano Zampini PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
481a3f881fbSStefano Zampini {
482a3f881fbSStefano Zampini   Mat_Product    *product = mat->product;
483a3f881fbSStefano Zampini   PetscErrorCode ierr;
484a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
485a3f881fbSStefano Zampini 
486a3f881fbSStefano Zampini   PetscFunctionBegin;
487a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
488a3f881fbSStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
489a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
490a3f881fbSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
491a3f881fbSStefano Zampini   }
492a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
493a3f881fbSStefano Zampini     switch (product->type) {
494a3f881fbSStefano Zampini     case MATPRODUCT_AB:
495a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
496a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
497a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
498a3f881fbSStefano Zampini       break;
499a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
500a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
501a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
502a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
503a3f881fbSStefano Zampini       break;
504a3f881fbSStefano Zampini     default:
505a3f881fbSStefano Zampini       break;
506a3f881fbSStefano Zampini     }
507a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
508a3f881fbSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
509a3f881fbSStefano Zampini   }
510a3f881fbSStefano Zampini   PetscFunctionReturn(0);
511a3f881fbSStefano Zampini }
512a3f881fbSStefano Zampini #endif
513a3f881fbSStefano Zampini 
5148c3ff71bSJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
5158c3ff71bSJunchao Zhang {
5168c3ff71bSJunchao Zhang   PetscErrorCode ierr;
5178c3ff71bSJunchao Zhang 
5188c3ff71bSJunchao Zhang   PetscFunctionBegin;
5198c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
5208c3ff71bSJunchao Zhang   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
5218c3ff71bSJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
5228c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5238c3ff71bSJunchao Zhang }
5248c3ff71bSJunchao Zhang 
525a587d139SMark static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
526a587d139SMark {
527a587d139SMark   PetscErrorCode    ierr;
528a587d139SMark   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
529a587d139SMark   PetscFunctionBegin;
530a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
531a587d139SMark     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported");
532a587d139SMark   }
533a587d139SMark   ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr);
534a587d139SMark   PetscFunctionReturn(0);
535a587d139SMark }
536a587d139SMark 
537*f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
538*f0cf5187SStefano Zampini {
539*f0cf5187SStefano Zampini   PetscErrorCode   ierr;
540*f0cf5187SStefano Zampini   Mat_SeqAIJKokkos *aijkok;
541*f0cf5187SStefano Zampini 
542*f0cf5187SStefano Zampini   PetscFunctionBegin;
543*f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
544*f0cf5187SStefano Zampini   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
545*f0cf5187SStefano Zampini   KokkosBlas::scal(aijkok->a_d,a,aijkok->a_d);
546*f0cf5187SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
547*f0cf5187SStefano Zampini   ierr = WaitForKokkos();CHKERRQ(ierr);
548*f0cf5187SStefano Zampini   ierr = PetscLogGpuFlops(aijkok->a_d.size());CHKERRQ(ierr);
549*f0cf5187SStefano Zampini   // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
550*f0cf5187SStefano Zampini   ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr);
551*f0cf5187SStefano Zampini   PetscFunctionReturn(0);
552*f0cf5187SStefano Zampini }
553*f0cf5187SStefano Zampini 
554a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
555a587d139SMark {
556a587d139SMark   PetscErrorCode   ierr;
557a587d139SMark   PetscBool        both = PETSC_FALSE;
558a587d139SMark   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
559a587d139SMark   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)A->data;
560a587d139SMark 
561a587d139SMark   PetscFunctionBegin;
562a587d139SMark   if (aijkok && aijkok->a_d.data()) {
563*f0cf5187SStefano Zampini     KokkosBlas::fill(aijkok->a_d,0.0);
564a587d139SMark     both = PETSC_TRUE;
565a587d139SMark   }
566a587d139SMark   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
567a587d139SMark   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
568a587d139SMark   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
569a587d139SMark   else A->offloadmask = PETSC_OFFLOAD_CPU;
570a587d139SMark   PetscFunctionReturn(0);
571a587d139SMark }
572a587d139SMark 
573*f0cf5187SStefano Zampini static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str)
574a587d139SMark {
575a587d139SMark   PetscErrorCode ierr;
576a587d139SMark   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
577a587d139SMark 
578a587d139SMark   PetscFunctionBegin;
579*f0cf5187SStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
580a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
581a587d139SMark     PetscFunctionReturn(0);
582a587d139SMark   }
583*f0cf5187SStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) {
584a587d139SMark     PetscBool e;
585a587d139SMark     ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
586a587d139SMark     if (e) {
587a587d139SMark       ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
588*f0cf5187SStefano Zampini       if (e) str = SAME_NONZERO_PATTERN;
589a587d139SMark     }
590a587d139SMark   }
591a587d139SMark   if (str != SAME_NONZERO_PATTERN) {
592a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
593a587d139SMark     PetscFunctionReturn(0);
594a587d139SMark   } else {
595a587d139SMark     if (Y->offloadmask == PETSC_OFFLOAD_CPU && X->offloadmask == PETSC_OFFLOAD_CPU) {
596a587d139SMark       ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
597a587d139SMark       PetscFunctionReturn(0);
598*f0cf5187SStefano Zampini     } else {
599a587d139SMark       ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
600*f0cf5187SStefano Zampini       ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr);
601a587d139SMark     }
602a587d139SMark     Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
603a587d139SMark     Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
604a587d139SMark     if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) {
605*f0cf5187SStefano Zampini       KokkosBlas::axpy(a,aijkokX->a_d,aijkokY->a_d);
606*f0cf5187SStefano Zampini       Y->offloadmask = PETSC_OFFLOAD_GPU;
607*f0cf5187SStefano Zampini       ierr = WaitForKokkos();CHKERRQ(ierr);
608a587d139SMark       ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr);
609*f0cf5187SStefano Zampini       // TODO Remove: this can be removed once we implement matmat operations with KOKKOS
610*f0cf5187SStefano Zampini       ierr = MatSeqAIJKokkosSyncHost(Y);CHKERRQ(ierr);
611a587d139SMark     } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???");
612a587d139SMark   }
613a587d139SMark   PetscFunctionReturn(0);
614a587d139SMark }
615a587d139SMark 
6168c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
6178c3ff71bSJunchao Zhang {
6188c3ff71bSJunchao Zhang   PetscFunctionBegin;
619a587d139SMark   A->ops->setvalues                 = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */
6208c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
6218c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
6228c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
623a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
624*f0cf5187SStefano Zampini   A->ops->scale                     = MatScale_SeqAIJKokkos;
625a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
626a3f881fbSStefano Zampini   //A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
6278c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
6288c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
6298c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
6308c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
6318c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
6328c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
6338c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
6348c3ff71bSJunchao Zhang }
6358c3ff71bSJunchao Zhang 
6368c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
6378c3ff71bSJunchao Zhang /*C
6388c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
6398c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
6408c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
6418c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
6428c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
6438c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
6448c3ff71bSJunchao Zhang 
6458c3ff71bSJunchao Zhang    Collective
6468c3ff71bSJunchao Zhang 
6478c3ff71bSJunchao Zhang    Input Parameters:
6488c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
6498c3ff71bSJunchao Zhang .  m - number of rows
6508c3ff71bSJunchao Zhang .  n - number of columns
6518c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
6528c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
6538c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
6548c3ff71bSJunchao Zhang 
6558c3ff71bSJunchao Zhang    Output Parameter:
6568c3ff71bSJunchao Zhang .  A - the matrix
6578c3ff71bSJunchao Zhang 
6588c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
6598c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
6608c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
6618c3ff71bSJunchao Zhang 
6628c3ff71bSJunchao Zhang    Notes:
6638c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
6648c3ff71bSJunchao Zhang 
6658c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
6668c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
6678c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
6688c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
6698c3ff71bSJunchao Zhang 
6708c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
6718c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
6728c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
6738c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
6748c3ff71bSJunchao Zhang 
6758c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
6768c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
6778c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
6788c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
6798c3ff71bSJunchao Zhang 
6808c3ff71bSJunchao Zhang    Level: intermediate
6818c3ff71bSJunchao Zhang 
6828c3ff71bSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
6838c3ff71bSJunchao Zhang @*/
6848c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
6858c3ff71bSJunchao Zhang {
6868c3ff71bSJunchao Zhang   PetscErrorCode ierr;
6878c3ff71bSJunchao Zhang 
6888c3ff71bSJunchao Zhang   PetscFunctionBegin;
6898c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
6908c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
6918c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
6928c3ff71bSJunchao Zhang   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
6938c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
6948c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
6958c3ff71bSJunchao Zhang }
696