xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision a3f881fbec03794bf1fbd5c59e880be1171aa47a)
18c3ff71bSJunchao Zhang #include "petsc/private/petscimpl.h"
28c3ff71bSJunchao Zhang #include <petscsystypes.h>
38c3ff71bSJunchao Zhang #include <petscerror.h>
49f114eb2SJunchao Zhang #include <petscveckokkos.hpp>
58c3ff71bSJunchao Zhang 
68c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
78c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
88c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
98c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
108c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h>
118c3ff71bSJunchao Zhang 
128c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
13a587d139SMark #include <petscmat.h>
148c3ff71bSJunchao Zhang 
158c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
168c3ff71bSJunchao Zhang 
178c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
188c3ff71bSJunchao Zhang {
198c3ff71bSJunchao Zhang   PetscErrorCode    ierr;
20a587d139SMark   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
218c3ff71bSJunchao Zhang 
228c3ff71bSJunchao Zhang   PetscFunctionBegin;
238c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
24a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
25a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
26a587d139SMark   }
278c3ff71bSJunchao Zhang   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
288c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
298c3ff71bSJunchao Zhang }
308c3ff71bSJunchao Zhang 
318c3ff71bSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
328c3ff71bSJunchao Zhang {
338c3ff71bSJunchao Zhang   Mat_SeqAIJ                *aijseq = (Mat_SeqAIJ*)A->data;
348c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
358c3ff71bSJunchao Zhang   PetscInt                  nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
368c3ff71bSJunchao Zhang 
378c3ff71bSJunchao Zhang   PetscFunctionBegin;
38*a3f881fbSStefano Zampini   PetscCheckTypeName(A,MATSEQAIJKOKKOS);
398c3ff71bSJunchao Zhang   /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */
408c3ff71bSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
418c3ff71bSJunchao Zhang     delete aijkok;
428c3ff71bSJunchao Zhang     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
438c3ff71bSJunchao Zhang     aijkok->nonzerostate = A->nonzerostate;
448c3ff71bSJunchao Zhang     A->spptr             = aijkok;
458c3ff71bSJunchao Zhang   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
468c3ff71bSJunchao Zhang     Kokkos::deep_copy(aijkok->a_d,aijkok->a_h);
478c3ff71bSJunchao Zhang   }
488c3ff71bSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
498c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
508c3ff71bSJunchao Zhang }
518c3ff71bSJunchao Zhang 
52a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
53a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat)
54a587d139SMark {
55a587d139SMark   Mat_SeqAIJKokkos *aijkok;
56a587d139SMark   Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat);
57a587d139SMark 
58a587d139SMark   PetscFunctionBegin;
59a587d139SMark   // ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
60a587d139SMark   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
61a587d139SMark   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos");
62a587d139SMark   aijkok->device_mat_d = create_mirror(DeviceMemorySpace(),h_mat_k);
63a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
64a587d139SMark   PetscFunctionReturn(0);
65a587d139SMark }
66a587d139SMark 
67a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
68a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat)
69a587d139SMark {
70a587d139SMark   Mat_SeqAIJKokkos *aijkok;
71a587d139SMark 
72a587d139SMark   PetscFunctionBegin;
73a587d139SMark   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
74a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
75a587d139SMark     *d_mat = aijkok->device_mat_d.data();
76a587d139SMark   } else {
77a587d139SMark     PetscErrorCode   ierr;
78a587d139SMark     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
79a587d139SMark     *d_mat  = NULL;
80a587d139SMark   }
81a587d139SMark   PetscFunctionReturn(0);
82a587d139SMark }
83a587d139SMark 
848c3ff71bSJunchao Zhang /* y = A x */
858c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
868c3ff71bSJunchao Zhang {
878c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
888c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
898c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv;
908c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          yv;
918c3ff71bSJunchao Zhang 
928c3ff71bSJunchao Zhang   PetscFunctionBegin;
938c3ff71bSJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
948c3ff71bSJunchao Zhang   ierr   = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
958c3ff71bSJunchao Zhang   ierr   = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
968c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
978c3ff71bSJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
988c3ff71bSJunchao Zhang   ierr   = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
998c3ff71bSJunchao Zhang   ierr   = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
1001e566348SJunchao Zhang   /* 2.0*aijkok->csr.nnz()-aijkok->csr.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */
101bb2d6e60SJunchao Zhang   ierr   = WaitForKokkos();CHKERRQ(ierr);
1021e566348SJunchao Zhang   ierr   = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1038c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1048c3ff71bSJunchao Zhang }
1058c3ff71bSJunchao Zhang 
1068c3ff71bSJunchao Zhang /* y = A^T x */
1078c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1088c3ff71bSJunchao Zhang {
1098c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1108c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
1118c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv;
1128c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          yv;
1138c3ff71bSJunchao Zhang 
1148c3ff71bSJunchao Zhang   PetscFunctionBegin;
1158c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1168c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1178c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
1188c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1198c3ff71bSJunchao Zhang   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
1208c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1218c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
122bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
1231e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1248c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1258c3ff71bSJunchao Zhang }
1268c3ff71bSJunchao Zhang 
1278c3ff71bSJunchao Zhang /* y = A^H x */
1288c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1298c3ff71bSJunchao Zhang {
1308c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1318c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
1328c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv;
1338c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          yv;
1348c3ff71bSJunchao Zhang 
1358c3ff71bSJunchao Zhang   PetscFunctionBegin;
1368c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1378c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1388c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
1398c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1408c3ff71bSJunchao Zhang   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
1418c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1428c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
143bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
1441e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1458c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1468c3ff71bSJunchao Zhang }
1478c3ff71bSJunchao Zhang 
1488c3ff71bSJunchao Zhang /* z = A x + y */
1498c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
1508c3ff71bSJunchao Zhang {
1518c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1528c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
1538c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv,yv;
1548c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          zv;
1558c3ff71bSJunchao Zhang 
1568c3ff71bSJunchao Zhang   PetscFunctionBegin;
1578c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1588c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1598c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
1608c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
1618c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
1628c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1638c3ff71bSJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
1648c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1658c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
1668c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
167bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
1681e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1698c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1708c3ff71bSJunchao Zhang }
1718c3ff71bSJunchao Zhang 
1728c3ff71bSJunchao Zhang /* z = A^T x + y */
1738c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
1748c3ff71bSJunchao Zhang {
1758c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1768c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
1778c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv,yv;
1788c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          zv;
1798c3ff71bSJunchao Zhang 
1808c3ff71bSJunchao Zhang   PetscFunctionBegin;
1818c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1828c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1838c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
1848c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
1858c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
1868c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1878c3ff71bSJunchao Zhang   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
1888c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1898c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
1908c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
191bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
1921e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1938c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1948c3ff71bSJunchao Zhang }
1958c3ff71bSJunchao Zhang 
1968c3ff71bSJunchao Zhang /* z = A^H x + y */
1978c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
1988c3ff71bSJunchao Zhang {
1998c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
2008c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
2018c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv,yv;
2028c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          zv;
2038c3ff71bSJunchao Zhang 
2048c3ff71bSJunchao Zhang   PetscFunctionBegin;
2058c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
2068c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
2078c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
2088c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
2098c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
2108c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
2118c3ff71bSJunchao Zhang   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
2128c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
2138c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
2148c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
215bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
2161e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
2178c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2188c3ff71bSJunchao Zhang }
2198c3ff71bSJunchao Zhang 
2203d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
2218c3ff71bSJunchao Zhang {
2228c3ff71bSJunchao Zhang   PetscErrorCode ierr;
2238c3ff71bSJunchao Zhang   Mat            B;
224c58ef05eSStefano Zampini   Mat_SeqAIJ     *aij;
2258c3ff71bSJunchao Zhang 
2268c3ff71bSJunchao Zhang   PetscFunctionBegin;
227*a3f881fbSStefano Zampini   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
2288c3ff71bSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
2298c3ff71bSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
2308c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
2318c3ff71bSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
2328c3ff71bSJunchao Zhang   }
2338c3ff71bSJunchao Zhang 
2348c3ff71bSJunchao Zhang   B    = *newmat;
2358c3ff71bSJunchao Zhang   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
2368c3ff71bSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
2378c3ff71bSJunchao Zhang   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
2380d8bce8aSStefano Zampini   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
239c58ef05eSStefano Zampini   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
240c58ef05eSStefano Zampini   aij  = (Mat_SeqAIJ*)B->data;
241c58ef05eSStefano Zampini   aij->inode.use = PETSC_FALSE;
2428c3ff71bSJunchao Zhang 
2438c3ff71bSJunchao Zhang   B->offloadmask = PETSC_OFFLOAD_CPU;
2448c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2458c3ff71bSJunchao Zhang }
2468c3ff71bSJunchao Zhang 
2478c3ff71bSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
2488c3ff71bSJunchao Zhang {
2498c3ff71bSJunchao Zhang   PetscErrorCode ierr;
2508c3ff71bSJunchao Zhang 
2518c3ff71bSJunchao Zhang   PetscFunctionBegin;
2528c3ff71bSJunchao Zhang   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
2538c3ff71bSJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
2548c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2558c3ff71bSJunchao Zhang }
2568c3ff71bSJunchao Zhang 
2578c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
2588c3ff71bSJunchao Zhang {
2598c3ff71bSJunchao Zhang   PetscErrorCode        ierr;
2608c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos      *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
2618c3ff71bSJunchao Zhang 
2628c3ff71bSJunchao Zhang   PetscFunctionBegin;
263a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
264a587d139SMark     delete aijkok->colmap_d;
265a587d139SMark     delete aijkok->i_uncompressed_d;
266a587d139SMark   }
2678c3ff71bSJunchao Zhang   delete aijkok;
2688c3ff71bSJunchao Zhang   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
2698c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2708c3ff71bSJunchao Zhang }
2718c3ff71bSJunchao Zhang 
272*a3f881fbSStefano Zampini #if 0
273*a3f881fbSStefano Zampini static PetscErrorCode MatMatKernelHandleDestroy_Private(void* data)
274*a3f881fbSStefano Zampini {
275*a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh = static_cast<MatMatKernelHandle_t *>(data);
276*a3f881fbSStefano Zampini 
277*a3f881fbSStefano Zampini   PetscFunctionBegin;
278*a3f881fbSStefano Zampini   delete kh;
279*a3f881fbSStefano Zampini   PetscFunctionReturn(0);
280*a3f881fbSStefano Zampini }
281*a3f881fbSStefano Zampini 
282*a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
283*a3f881fbSStefano Zampini {
284*a3f881fbSStefano Zampini   Mat_Product          *product = C->product;
285*a3f881fbSStefano Zampini   Mat                  A,B;
286*a3f881fbSStefano Zampini   MatProductType       ptype;
287*a3f881fbSStefano Zampini   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
288*a3f881fbSStefano Zampini   bool                 tA,tB;
289*a3f881fbSStefano Zampini   PetscErrorCode       ierr;
290*a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh;
291*a3f881fbSStefano Zampini   Mat_SeqAIJ           *c;
292*a3f881fbSStefano Zampini 
293*a3f881fbSStefano Zampini   PetscFunctionBegin;
294*a3f881fbSStefano Zampini   MatCheckProduct(C,1);
295*a3f881fbSStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
296*a3f881fbSStefano Zampini   A = product->A;
297*a3f881fbSStefano Zampini   B = product->B;
298*a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
299*a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
300*a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
301*a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
302*a3f881fbSStefano Zampini   ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr);
303*a3f881fbSStefano Zampini   kh   = static_cast<MatMatKernelHandle_t*>(C->product->data);
304*a3f881fbSStefano Zampini   ptype = product->type;
305*a3f881fbSStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
306*a3f881fbSStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
307*a3f881fbSStefano Zampini   switch (ptype) {
308*a3f881fbSStefano Zampini   case MATPRODUCT_AB:
309*a3f881fbSStefano Zampini     tA = false;
310*a3f881fbSStefano Zampini     tB = false;
311*a3f881fbSStefano Zampini     break;
312*a3f881fbSStefano Zampini   case MATPRODUCT_AtB:
313*a3f881fbSStefano Zampini     tA = true;
314*a3f881fbSStefano Zampini     tB = false;
315*a3f881fbSStefano Zampini     break;
316*a3f881fbSStefano Zampini   case MATPRODUCT_ABt:
317*a3f881fbSStefano Zampini     tA = false;
318*a3f881fbSStefano Zampini     tB = true;
319*a3f881fbSStefano Zampini     break;
320*a3f881fbSStefano Zampini   default:
321*a3f881fbSStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
322*a3f881fbSStefano Zampini   }
323*a3f881fbSStefano Zampini 
324*a3f881fbSStefano Zampini   KokkosSparse::spgemm_numeric(*kh, akok->csr, tA, bkok->csr, tB, ckok->csr);
325*a3f881fbSStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
326*a3f881fbSStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
327*a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
328*a3f881fbSStefano Zampini   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
329*a3f881fbSStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
330*a3f881fbSStefano Zampini   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
331*a3f881fbSStefano Zampini   c->reallocs         = 0;
332*a3f881fbSStefano Zampini   C->info.mallocs    += 0;
333*a3f881fbSStefano Zampini   C->info.nz_unneeded = 0;
334*a3f881fbSStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
335*a3f881fbSStefano Zampini   C->num_ass++;
336*a3f881fbSStefano Zampini   /* we can remove these calls when MatSeqAIJGetArray operations are used everywhere! */
337*a3f881fbSStefano Zampini   // TODO JZ, copy from device to host since most of Petsc code for AIJ matrices does not use MatSeqAIJGetArray()
338*a3f881fbSStefano Zampini   C->offloadmask = PETSC_OFFLOAD_BOTH;
339*a3f881fbSStefano Zampini   // Also, we should add support to copy back from device to host
340*a3f881fbSStefano Zampini   PetscFunctionReturn(0);
341*a3f881fbSStefano Zampini }
342*a3f881fbSStefano Zampini 
343*a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
344*a3f881fbSStefano Zampini {
345*a3f881fbSStefano Zampini   Mat_Product          *product = C->product;
346*a3f881fbSStefano Zampini   Mat                  A,B;
347*a3f881fbSStefano Zampini   MatProductType       ptype;
348*a3f881fbSStefano Zampini   Mat_SeqAIJKokkos     *akok,*bkok,*ckok;
349*a3f881fbSStefano Zampini   PetscInt             m,n,k;
350*a3f881fbSStefano Zampini   bool                 tA,tB;
351*a3f881fbSStefano Zampini   PetscErrorCode       ierr;
352*a3f881fbSStefano Zampini   Mat_SeqAIJ           *c;
353*a3f881fbSStefano Zampini   MatMatKernelHandle_t *kh;
354*a3f881fbSStefano Zampini 
355*a3f881fbSStefano Zampini   PetscFunctionBegin;
356*a3f881fbSStefano Zampini   MatCheckProduct(C,1);
357*a3f881fbSStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
358*a3f881fbSStefano Zampini   A = product->A;
359*a3f881fbSStefano Zampini   B = product->B;
360*a3f881fbSStefano Zampini   // TODO only copy the i,j data, not the values
361*a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
362*a3f881fbSStefano Zampini   ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr);
363*a3f881fbSStefano Zampini   akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
364*a3f881fbSStefano Zampini   bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr);
365*a3f881fbSStefano Zampini   ptype = product->type;
366*a3f881fbSStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
367*a3f881fbSStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
368*a3f881fbSStefano Zampini   switch (ptype) {
369*a3f881fbSStefano Zampini   case MATPRODUCT_AB:
370*a3f881fbSStefano Zampini     tA = false;
371*a3f881fbSStefano Zampini     tB = false;
372*a3f881fbSStefano Zampini     m = A->rmap->n;
373*a3f881fbSStefano Zampini     n = B->cmap->n;
374*a3f881fbSStefano Zampini     break;
375*a3f881fbSStefano Zampini   case MATPRODUCT_AtB:
376*a3f881fbSStefano Zampini     tA = true;
377*a3f881fbSStefano Zampini     tB = false;
378*a3f881fbSStefano Zampini     m = A->cmap->n;
379*a3f881fbSStefano Zampini     n = B->cmap->n;
380*a3f881fbSStefano Zampini     break;
381*a3f881fbSStefano Zampini   case MATPRODUCT_ABt:
382*a3f881fbSStefano Zampini     tA = false;
383*a3f881fbSStefano Zampini     tB = true;
384*a3f881fbSStefano Zampini     m = A->rmap->n;
385*a3f881fbSStefano Zampini     n = B->rmap->n;
386*a3f881fbSStefano Zampini     break;
387*a3f881fbSStefano Zampini   default:
388*a3f881fbSStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
389*a3f881fbSStefano Zampini   }
390*a3f881fbSStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
391*a3f881fbSStefano Zampini   ierr = MatSetType(C,MATSEQAIJKOKKOS);CHKERRQ(ierr);
392*a3f881fbSStefano Zampini   c = (Mat_SeqAIJ*)C->data;
393*a3f881fbSStefano Zampini 
394*a3f881fbSStefano Zampini   kh = new MatMatKernelHandle_t;
395*a3f881fbSStefano Zampini   // TODO SZ: ADD RUNTIME SELECTION OF THESE
396*a3f881fbSStefano Zampini   kh->set_team_work_size(16);
397*a3f881fbSStefano Zampini   kh->set_dynamic_scheduling(true);
398*a3f881fbSStefano Zampini   // Select an spgemm algorithm, limited by configuration at compile-time and
399*a3f881fbSStefano Zampini   // set via the handle Some options: {SPGEMM_KK_MEMORY, SPGEMM_KK_SPEED,
400*a3f881fbSStefano Zampini   // SPGEMM_KK_MEMSPEED, /*SPGEMM_CUSPARSE, */ SPGEMM_MKL}
401*a3f881fbSStefano Zampini   std::string myalg("SPGEMM_KK_MEMORY");
402*a3f881fbSStefano Zampini   kh->create_spgemm_handle(KokkosSparse::StringToSPGEMMAlgorithm(myalg));
403*a3f881fbSStefano Zampini 
404*a3f881fbSStefano Zampini   /////////////////////////////////////
405*a3f881fbSStefano Zampini   // TODO JZ
406*a3f881fbSStefano Zampini   ckok = NULL; //new Mat_SeqAIJKokkos();
407*a3f881fbSStefano Zampini   C->spptr = ckok;
408*a3f881fbSStefano Zampini   KokkosCsrMatrix_t ccsr; // here only to have the code compile
409*a3f881fbSStefano Zampini   KokkosSparse::spgemm_symbolic(*kh, akok->csr, tA, bkok->csr, tB, ccsr);
410*a3f881fbSStefano Zampini   //cerr = WaitForKOKKOS();CHKERRCUDA(cerr);
411*a3f881fbSStefano Zampini   //c->nz = get_nnz_from_ccsr
412*a3f881fbSStefano Zampini   //////////////////////////////////////
413*a3f881fbSStefano Zampini   c->singlemalloc = PETSC_FALSE;
414*a3f881fbSStefano Zampini   c->free_a       = PETSC_TRUE;
415*a3f881fbSStefano Zampini   c->free_ij      = PETSC_TRUE;
416*a3f881fbSStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
417*a3f881fbSStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
418*a3f881fbSStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
419*a3f881fbSStefano Zampini   ////////////////////////////////////
420*a3f881fbSStefano Zampini   // TODO JZ copy from device to c->i and c->j
421*a3f881fbSStefano Zampini   ////////////////////////////////////
422*a3f881fbSStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
423*a3f881fbSStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
424*a3f881fbSStefano Zampini   c->maxnz = c->nz;
425*a3f881fbSStefano Zampini   c->nonzerorowcnt = 0;
426*a3f881fbSStefano Zampini   c->rmax = 0;
427*a3f881fbSStefano Zampini   for (k = 0; k < m; k++) {
428*a3f881fbSStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
429*a3f881fbSStefano Zampini     c->ilen[k] = c->imax[k] = nn;
430*a3f881fbSStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
431*a3f881fbSStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
432*a3f881fbSStefano Zampini   }
433*a3f881fbSStefano Zampini 
434*a3f881fbSStefano Zampini   C->nonzerostate++;
435*a3f881fbSStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
436*a3f881fbSStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
437*a3f881fbSStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
438*a3f881fbSStefano Zampini   ckok->nonzerostate = C->nonzerostate;
439*a3f881fbSStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
440*a3f881fbSStefano Zampini   C->preallocated  = PETSC_TRUE;
441*a3f881fbSStefano Zampini   C->assembled     = PETSC_FALSE;
442*a3f881fbSStefano Zampini   C->was_assembled = PETSC_FALSE;
443*a3f881fbSStefano Zampini 
444*a3f881fbSStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
445*a3f881fbSStefano Zampini   C->product->data = kh;
446*a3f881fbSStefano Zampini   C->product->destroy = MatMatKernelHandleDestroy_Private;
447*a3f881fbSStefano Zampini   PetscFunctionReturn(0);
448*a3f881fbSStefano Zampini }
449*a3f881fbSStefano Zampini 
450*a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */
451*a3f881fbSStefano Zampini PETSC_UNUSED static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
452*a3f881fbSStefano Zampini {
453*a3f881fbSStefano Zampini   Mat_Product    *product = mat->product;
454*a3f881fbSStefano Zampini   PetscErrorCode ierr;
455*a3f881fbSStefano Zampini   PetscBool      Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE;
456*a3f881fbSStefano Zampini 
457*a3f881fbSStefano Zampini   PetscFunctionBegin;
458*a3f881fbSStefano Zampini   MatCheckProduct(mat,1);
459*a3f881fbSStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr);
460*a3f881fbSStefano Zampini   if (product->type == MATPRODUCT_ABC) {
461*a3f881fbSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr);
462*a3f881fbSStefano Zampini   }
463*a3f881fbSStefano Zampini   if (Biskok && Ciskok) {
464*a3f881fbSStefano Zampini     switch (product->type) {
465*a3f881fbSStefano Zampini     case MATPRODUCT_AB:
466*a3f881fbSStefano Zampini     case MATPRODUCT_AtB:
467*a3f881fbSStefano Zampini     case MATPRODUCT_ABt:
468*a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
469*a3f881fbSStefano Zampini       break;
470*a3f881fbSStefano Zampini     case MATPRODUCT_PtAP:
471*a3f881fbSStefano Zampini     case MATPRODUCT_RARt:
472*a3f881fbSStefano Zampini     case MATPRODUCT_ABC:
473*a3f881fbSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
474*a3f881fbSStefano Zampini       break;
475*a3f881fbSStefano Zampini     default:
476*a3f881fbSStefano Zampini       break;
477*a3f881fbSStefano Zampini     }
478*a3f881fbSStefano Zampini   } else { /* fallback for AIJ */
479*a3f881fbSStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
480*a3f881fbSStefano Zampini   }
481*a3f881fbSStefano Zampini   PetscFunctionReturn(0);
482*a3f881fbSStefano Zampini }
483*a3f881fbSStefano Zampini #endif
484*a3f881fbSStefano Zampini 
4858c3ff71bSJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
4868c3ff71bSJunchao Zhang {
4878c3ff71bSJunchao Zhang   PetscErrorCode ierr;
4888c3ff71bSJunchao Zhang 
4898c3ff71bSJunchao Zhang   PetscFunctionBegin;
4908c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
4918c3ff71bSJunchao Zhang   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
4928c3ff71bSJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
4938c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4948c3ff71bSJunchao Zhang }
4958c3ff71bSJunchao Zhang 
496a587d139SMark static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
497a587d139SMark {
498a587d139SMark   PetscErrorCode    ierr;
499a587d139SMark   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
500a587d139SMark   PetscFunctionBegin;
501a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
502a587d139SMark     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported");
503a587d139SMark   }
504a587d139SMark   ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr);
505a587d139SMark   PetscFunctionReturn(0);
506a587d139SMark }
507a587d139SMark 
508a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
509a587d139SMark {
510a587d139SMark   PetscErrorCode             ierr;
511a587d139SMark   PetscBool                  both = PETSC_FALSE;
512a587d139SMark   Mat_SeqAIJKokkos           *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
513a587d139SMark   Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
514a587d139SMark 
515a587d139SMark   PetscFunctionBegin;
516a587d139SMark   if (aijkok && aijkok->a_d.data()) {
517a587d139SMark     Kokkos::parallel_for (aijkok->a_d.size(), KOKKOS_LAMBDA (int i) { aijkok->a_d(i) = 0; });
518a587d139SMark     both = PETSC_TRUE;
519a587d139SMark   }
520a587d139SMark   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
521a587d139SMark   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
522a587d139SMark   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
523a587d139SMark   else A->offloadmask = PETSC_OFFLOAD_CPU;
524a587d139SMark 
525a587d139SMark   PetscFunctionReturn(0);
526a587d139SMark }
527a587d139SMark 
528a587d139SMark static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str) // put axpy in aijcusparse, etc.
529a587d139SMark {
530a587d139SMark   PetscErrorCode ierr;
531a587d139SMark   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
532a587d139SMark   PetscBool      flgx,flgy;
533a587d139SMark 
534a587d139SMark   PetscFunctionBegin;
535a587d139SMark   if (a == (PetscScalar)0.0) PetscFunctionReturn(0);
536a587d139SMark   PetscValidHeaderSpecific(Y,MAT_CLASSID,1);
537a587d139SMark   PetscValidHeaderSpecific(X,MAT_CLASSID,3);
538a587d139SMark   ierr = PetscObjectTypeCompare((PetscObject)Y,MATSEQAIJKOKKOS,&flgy);CHKERRQ(ierr);
539a587d139SMark   ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQAIJKOKKOS,&flgx);CHKERRQ(ierr);
540a587d139SMark   if (!flgx || !flgy) {
541a587d139SMark     ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
542a587d139SMark     PetscFunctionReturn(0);
543a587d139SMark   }
544a587d139SMark   if (str == DIFFERENT_NONZERO_PATTERN) {
545a587d139SMark     if (x->nz == y->nz) {
546a587d139SMark       PetscBool e;
547a587d139SMark       ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
548a587d139SMark       if (e) {
549a587d139SMark         ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
550a587d139SMark         if (e) {
551a587d139SMark           str = SAME_NONZERO_PATTERN;
552a587d139SMark         }
553a587d139SMark       }
554a587d139SMark     }
555a587d139SMark   }
556a587d139SMark   if (str != SAME_NONZERO_PATTERN) {
557a587d139SMark     ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
558a587d139SMark     PetscFunctionReturn(0);
559a587d139SMark   } else {
560a587d139SMark     if (Y->offloadmask==PETSC_OFFLOAD_CPU && X->offloadmask==PETSC_OFFLOAD_CPU) {
561a587d139SMark       ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
562a587d139SMark       PetscFunctionReturn(0);
563a587d139SMark     } else if ((Y->offloadmask==PETSC_OFFLOAD_GPU || Y->offloadmask==PETSC_OFFLOAD_BOTH) && X->offloadmask == PETSC_OFFLOAD_CPU) {
564a587d139SMark       ierr    = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
565a587d139SMark     } else if ((X->offloadmask==PETSC_OFFLOAD_GPU || X->offloadmask==PETSC_OFFLOAD_BOTH) && Y->offloadmask == PETSC_OFFLOAD_CPU) {
566a587d139SMark       ierr    = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); /* promote Y */
567a587d139SMark     }
568a587d139SMark     {
569a587d139SMark       Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
570a587d139SMark       Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
571a587d139SMark       if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) {
572a587d139SMark         Kokkos::parallel_for (aijkokY->a_d.size(), KOKKOS_LAMBDA (int i) { aijkokY->a_d(i) += a*aijkokX->a_d(i); });
573a587d139SMark         Kokkos::deep_copy(aijkokY->a_h,aijkokY->a_d); // we could not copy and keep GPU
574a587d139SMark         Y->offloadmask = PETSC_OFFLOAD_BOTH;
575a587d139SMark         ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr);
576a587d139SMark       } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???");
577a587d139SMark     }
578a587d139SMark   }
579a587d139SMark   PetscFunctionReturn(0);
580a587d139SMark }
581a587d139SMark 
5828c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
5838c3ff71bSJunchao Zhang {
5848c3ff71bSJunchao Zhang   PetscFunctionBegin;
585a587d139SMark   A->ops->setvalues                 = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */
5868c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
5878c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
5888c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
589a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
590a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
591*a3f881fbSStefano Zampini   //A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
5928c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
5938c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
5948c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
5958c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
5968c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
5978c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
5988c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
5998c3ff71bSJunchao Zhang }
6008c3ff71bSJunchao Zhang 
6018c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
6028c3ff71bSJunchao Zhang /*C
6038c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
6048c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
6058c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
6068c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
6078c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
6088c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
6098c3ff71bSJunchao Zhang 
6108c3ff71bSJunchao Zhang    Collective
6118c3ff71bSJunchao Zhang 
6128c3ff71bSJunchao Zhang    Input Parameters:
6138c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
6148c3ff71bSJunchao Zhang .  m - number of rows
6158c3ff71bSJunchao Zhang .  n - number of columns
6168c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
6178c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
6188c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
6198c3ff71bSJunchao Zhang 
6208c3ff71bSJunchao Zhang    Output Parameter:
6218c3ff71bSJunchao Zhang .  A - the matrix
6228c3ff71bSJunchao Zhang 
6238c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
6248c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
6258c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
6268c3ff71bSJunchao Zhang 
6278c3ff71bSJunchao Zhang    Notes:
6288c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
6298c3ff71bSJunchao Zhang 
6308c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
6318c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
6328c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
6338c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
6348c3ff71bSJunchao Zhang 
6358c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
6368c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
6378c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
6388c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
6398c3ff71bSJunchao Zhang 
6408c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
6418c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
6428c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
6438c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
6448c3ff71bSJunchao Zhang 
6458c3ff71bSJunchao Zhang    Level: intermediate
6468c3ff71bSJunchao Zhang 
6478c3ff71bSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
6488c3ff71bSJunchao Zhang @*/
6498c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
6508c3ff71bSJunchao Zhang {
6518c3ff71bSJunchao Zhang   PetscErrorCode ierr;
6528c3ff71bSJunchao Zhang 
6538c3ff71bSJunchao Zhang   PetscFunctionBegin;
6548c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
6558c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
6568c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
6578c3ff71bSJunchao Zhang   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
6588c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
6598c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
6608c3ff71bSJunchao Zhang }
661