xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision a587d1398bd356a7db20a09283619c724e82d622)
18c3ff71bSJunchao Zhang #include "petsc/private/petscimpl.h"
28c3ff71bSJunchao Zhang #include <petscsystypes.h>
38c3ff71bSJunchao Zhang #include <petscerror.h>
49f114eb2SJunchao Zhang #include <petscveckokkos.hpp>
58c3ff71bSJunchao Zhang 
68c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp>
78c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp>
88c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
98c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp>
108c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h>
118c3ff71bSJunchao Zhang 
128c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
13*a587d139SMark #include <petscmat.h>
148c3ff71bSJunchao Zhang 
158c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
168c3ff71bSJunchao Zhang 
178c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)
188c3ff71bSJunchao Zhang {
198c3ff71bSJunchao Zhang   PetscErrorCode    ierr;
20*a587d139SMark   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
218c3ff71bSJunchao Zhang 
228c3ff71bSJunchao Zhang   PetscFunctionBegin;
238c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
24*a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
25*a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
26*a587d139SMark   }
278c3ff71bSJunchao Zhang   /* Don't build (or update) the Mat_SeqAIJKokkos struct. We delay it to the very last moment until we need it. */
288c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
298c3ff71bSJunchao Zhang }
308c3ff71bSJunchao Zhang 
318c3ff71bSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
328c3ff71bSJunchao Zhang {
338c3ff71bSJunchao Zhang   Mat_SeqAIJ                *aijseq = (Mat_SeqAIJ*)A->data;
348c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos          *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
358c3ff71bSJunchao Zhang   PetscInt                  nrows   = A->rmap->n,ncols = A->cmap->n,nnz = aijseq->nz;
368c3ff71bSJunchao Zhang 
378c3ff71bSJunchao Zhang   PetscFunctionBegin;
388c3ff71bSJunchao Zhang   /* If aijkok is not built yet or the nonzero pattern on CPU has changed, build aijkok from the scratch */
398c3ff71bSJunchao Zhang   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) {
408c3ff71bSJunchao Zhang     delete aijkok;
418c3ff71bSJunchao Zhang     aijkok               = new Mat_SeqAIJKokkos(nrows,ncols,nnz,aijseq->i,aijseq->j,aijseq->a);
428c3ff71bSJunchao Zhang     aijkok->nonzerostate = A->nonzerostate;
438c3ff71bSJunchao Zhang     A->spptr             = aijkok;
448c3ff71bSJunchao Zhang   } else if (A->offloadmask == PETSC_OFFLOAD_CPU) { /* Copy values only */
458c3ff71bSJunchao Zhang     Kokkos::deep_copy(aijkok->a_d,aijkok->a_h);
468c3ff71bSJunchao Zhang   }
478c3ff71bSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
488c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
498c3ff71bSJunchao Zhang }
508c3ff71bSJunchao Zhang 
51*a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy
52*a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure *h_mat)
53*a587d139SMark {
54*a587d139SMark   Mat_SeqAIJKokkos *aijkok;
55*a587d139SMark   Kokkos::View<PetscSplitCSRDataStructure, Kokkos::HostSpace> h_mat_k(h_mat);
56*a587d139SMark 
57*a587d139SMark   PetscFunctionBegin;
58*a587d139SMark   // ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
59*a587d139SMark   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
60*a587d139SMark   if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos");
61*a587d139SMark   aijkok->device_mat_d = create_mirror(DeviceMemorySpace(),h_mat_k);
62*a587d139SMark   Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k);
63*a587d139SMark   PetscFunctionReturn(0);
64*a587d139SMark }
65*a587d139SMark 
66*a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL
67*a587d139SMark PETSC_EXTERN PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure **d_mat)
68*a587d139SMark {
69*a587d139SMark   Mat_SeqAIJKokkos *aijkok;
70*a587d139SMark 
71*a587d139SMark   PetscFunctionBegin;
72*a587d139SMark   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
73*a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
74*a587d139SMark     *d_mat = aijkok->device_mat_d.data();
75*a587d139SMark   } else {
76*a587d139SMark     PetscErrorCode   ierr;
77*a587d139SMark     ierr    = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it)
78*a587d139SMark     *d_mat  = NULL;
79*a587d139SMark   }
80*a587d139SMark   PetscFunctionReturn(0);
81*a587d139SMark }
82*a587d139SMark 
838c3ff71bSJunchao Zhang /* y = A x */
848c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
858c3ff71bSJunchao Zhang {
868c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
878c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
888c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv;
898c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          yv;
908c3ff71bSJunchao Zhang 
918c3ff71bSJunchao Zhang   PetscFunctionBegin;
928c3ff71bSJunchao Zhang   ierr   = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
938c3ff71bSJunchao Zhang   ierr   = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
948c3ff71bSJunchao Zhang   ierr   = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
958c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
968c3ff71bSJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */
978c3ff71bSJunchao Zhang   ierr   = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
988c3ff71bSJunchao Zhang   ierr   = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
991e566348SJunchao Zhang   /* 2.0*aijkok->csr.nnz()-aijkok->csr.numRows() seems more accurate here but assumes there are no zero-rows. So a little sloopy here. */
100bb2d6e60SJunchao Zhang   ierr   = WaitForKokkos();CHKERRQ(ierr);
1011e566348SJunchao Zhang   ierr   = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1028c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1038c3ff71bSJunchao Zhang }
1048c3ff71bSJunchao Zhang 
1058c3ff71bSJunchao Zhang /* y = A^T x */
1068c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1078c3ff71bSJunchao Zhang {
1088c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1098c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
1108c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv;
1118c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          yv;
1128c3ff71bSJunchao Zhang 
1138c3ff71bSJunchao Zhang   PetscFunctionBegin;
1148c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1158c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1168c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
1178c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1188c3ff71bSJunchao Zhang   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */
1198c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1208c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
121bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
1221e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1238c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1248c3ff71bSJunchao Zhang }
1258c3ff71bSJunchao Zhang 
1268c3ff71bSJunchao Zhang /* y = A^H x */
1278c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)
1288c3ff71bSJunchao Zhang {
1298c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1308c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
1318c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv;
1328c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          yv;
1338c3ff71bSJunchao Zhang 
1348c3ff71bSJunchao Zhang   PetscFunctionBegin;
1358c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1368c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1378c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(yy,&yv);CHKERRQ(ierr);
1388c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1398c3ff71bSJunchao Zhang   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */
1408c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1418c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(yy,&yv);CHKERRQ(ierr);
142bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
1431e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1448c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1458c3ff71bSJunchao Zhang }
1468c3ff71bSJunchao Zhang 
1478c3ff71bSJunchao Zhang /* z = A x + y */
1488c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz)
1498c3ff71bSJunchao Zhang {
1508c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1518c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
1528c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv,yv;
1538c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          zv;
1548c3ff71bSJunchao Zhang 
1558c3ff71bSJunchao Zhang   PetscFunctionBegin;
1568c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1578c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1588c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
1598c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
1608c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
1618c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1628c3ff71bSJunchao Zhang   KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */
1638c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1648c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
1658c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
166bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
1671e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1688c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1698c3ff71bSJunchao Zhang }
1708c3ff71bSJunchao Zhang 
1718c3ff71bSJunchao Zhang /* z = A^T x + y */
1728c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
1738c3ff71bSJunchao Zhang {
1748c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1758c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
1768c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv,yv;
1778c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          zv;
1788c3ff71bSJunchao Zhang 
1798c3ff71bSJunchao Zhang   PetscFunctionBegin;
1808c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
1818c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1828c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
1838c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
1848c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
1858c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
1868c3ff71bSJunchao Zhang   KokkosSparse::spmv("T",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */
1878c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
1888c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
1898c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
190bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
1911e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
1928c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1938c3ff71bSJunchao Zhang }
1948c3ff71bSJunchao Zhang 
1958c3ff71bSJunchao Zhang /* z = A^H x + y */
1968c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)
1978c3ff71bSJunchao Zhang {
1988c3ff71bSJunchao Zhang   PetscErrorCode                   ierr;
1998c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos                 *aijkok;
2008c3ff71bSJunchao Zhang   ConstPetscScalarViewDevice_t     xv,yv;
2018c3ff71bSJunchao Zhang   PetscScalarViewDevice_t          zv;
2028c3ff71bSJunchao Zhang 
2038c3ff71bSJunchao Zhang   PetscFunctionBegin;
2048c3ff71bSJunchao Zhang   ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
2058c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(xx,&xv);CHKERRQ(ierr);
2068c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceViewRead(yy,&yv);CHKERRQ(ierr);
2078c3ff71bSJunchao Zhang   ierr = VecKokkosGetDeviceView(zz,&zv);CHKERRQ(ierr);
2088c3ff71bSJunchao Zhang   if (zz != yy) Kokkos::deep_copy(zv,yv);
2098c3ff71bSJunchao Zhang   aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
2108c3ff71bSJunchao Zhang   KokkosSparse::spmv("C",1.0/*alpha*/,aijkok->csr,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */
2118c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(xx,&xv);CHKERRQ(ierr);
2128c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceViewRead(yy,&yv);CHKERRQ(ierr);
2138c3ff71bSJunchao Zhang   ierr = VecKokkosRestoreDeviceView(zz,&zv);CHKERRQ(ierr);
214bb2d6e60SJunchao Zhang   ierr = WaitForKokkos();CHKERRQ(ierr);
2151e566348SJunchao Zhang   ierr = PetscLogGpuFlops(2.0*aijkok->csr.nnz());CHKERRQ(ierr);
2168c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2178c3ff71bSJunchao Zhang }
2188c3ff71bSJunchao Zhang 
2193d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
2208c3ff71bSJunchao Zhang {
2218c3ff71bSJunchao Zhang   PetscErrorCode   ierr;
2228c3ff71bSJunchao Zhang   Mat              B;
223c58ef05eSStefano Zampini   Mat_SeqAIJ       *aij;
2248c3ff71bSJunchao Zhang 
2258c3ff71bSJunchao Zhang   PetscFunctionBegin;
2268c3ff71bSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) { /* Build a new mat */
2278c3ff71bSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
2288c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
2298c3ff71bSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
2308c3ff71bSJunchao Zhang   }
2318c3ff71bSJunchao Zhang 
2328c3ff71bSJunchao Zhang   B    = *newmat;
2338c3ff71bSJunchao Zhang   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
2348c3ff71bSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
2358c3ff71bSJunchao Zhang   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
2360d8bce8aSStefano Zampini   ierr = MatSetOps_SeqAIJKokkos(B);CHKERRQ(ierr);
237c58ef05eSStefano Zampini   /* TODO: see ViennaCL and CUSPARSE once we have a BindToCPU? */
238c58ef05eSStefano Zampini   aij  = (Mat_SeqAIJ*)B->data;
239c58ef05eSStefano Zampini   aij->inode.use = PETSC_FALSE;
2408c3ff71bSJunchao Zhang 
2418c3ff71bSJunchao Zhang   B->offloadmask = PETSC_OFFLOAD_CPU;
2428c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2438c3ff71bSJunchao Zhang }
2448c3ff71bSJunchao Zhang 
2458c3ff71bSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption cpvalues,Mat *B)
2468c3ff71bSJunchao Zhang {
2478c3ff71bSJunchao Zhang   PetscErrorCode ierr;
2488c3ff71bSJunchao Zhang 
2498c3ff71bSJunchao Zhang   PetscFunctionBegin;
2508c3ff71bSJunchao Zhang   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
2518c3ff71bSJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(*B,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
2528c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2538c3ff71bSJunchao Zhang }
2548c3ff71bSJunchao Zhang 
2558c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
2568c3ff71bSJunchao Zhang {
2578c3ff71bSJunchao Zhang   PetscErrorCode        ierr;
2588c3ff71bSJunchao Zhang   Mat_SeqAIJKokkos      *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
2598c3ff71bSJunchao Zhang 
2608c3ff71bSJunchao Zhang   PetscFunctionBegin;
261*a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
262*a587d139SMark     delete aijkok->colmap_d;
263*a587d139SMark     delete aijkok->i_uncompressed_d;
264*a587d139SMark   }
2658c3ff71bSJunchao Zhang   delete aijkok;
2668c3ff71bSJunchao Zhang   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
2678c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2688c3ff71bSJunchao Zhang }
2698c3ff71bSJunchao Zhang 
2708c3ff71bSJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
2718c3ff71bSJunchao Zhang {
2728c3ff71bSJunchao Zhang   PetscErrorCode ierr;
2738c3ff71bSJunchao Zhang 
2748c3ff71bSJunchao Zhang   PetscFunctionBegin;
2758c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
2768c3ff71bSJunchao Zhang   ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr);
2778c3ff71bSJunchao Zhang   ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
2788c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2798c3ff71bSJunchao Zhang }
2808c3ff71bSJunchao Zhang 
281*a587d139SMark static PetscErrorCode MatSetValues_SeqAIJKokkos(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
282*a587d139SMark {
283*a587d139SMark   PetscErrorCode    ierr;
284*a587d139SMark   Mat_SeqAIJKokkos  *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
285*a587d139SMark   PetscFunctionBegin;
286*a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
287*a587d139SMark     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Mixing GPU and non-GPU assembly not supported");
288*a587d139SMark   }
289*a587d139SMark   ierr = MatSetValues_SeqAIJ(A,m,im,n,in,v,is);CHKERRQ(ierr);
290*a587d139SMark   PetscFunctionReturn(0);
291*a587d139SMark }
292*a587d139SMark 
293*a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
294*a587d139SMark {
295*a587d139SMark   PetscErrorCode             ierr;
296*a587d139SMark   PetscBool                  both = PETSC_FALSE;
297*a587d139SMark   Mat_SeqAIJKokkos           *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
298*a587d139SMark   Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
299*a587d139SMark 
300*a587d139SMark   PetscFunctionBegin;
301*a587d139SMark   if (aijkok && aijkok->a_d.data()) {
302*a587d139SMark     Kokkos::parallel_for (aijkok->a_d.size(), KOKKOS_LAMBDA (int i) { aijkok->a_d(i) = 0; });
303*a587d139SMark     both = PETSC_TRUE;
304*a587d139SMark   }
305*a587d139SMark   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
306*a587d139SMark   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
307*a587d139SMark   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
308*a587d139SMark   else A->offloadmask = PETSC_OFFLOAD_CPU;
309*a587d139SMark 
310*a587d139SMark   PetscFunctionReturn(0);
311*a587d139SMark }
312*a587d139SMark 
313*a587d139SMark static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str) // put axpy in aijcusparse, etc.
314*a587d139SMark {
315*a587d139SMark   PetscErrorCode ierr;
316*a587d139SMark   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
317*a587d139SMark   PetscBool      flgx,flgy;
318*a587d139SMark 
319*a587d139SMark   PetscFunctionBegin;
320*a587d139SMark   if (a == (PetscScalar)0.0) PetscFunctionReturn(0);
321*a587d139SMark   PetscValidHeaderSpecific(Y,MAT_CLASSID,1);
322*a587d139SMark   PetscValidHeaderSpecific(X,MAT_CLASSID,3);
323*a587d139SMark   ierr = PetscObjectTypeCompare((PetscObject)Y,MATSEQAIJKOKKOS,&flgy);CHKERRQ(ierr);
324*a587d139SMark   ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQAIJKOKKOS,&flgx);CHKERRQ(ierr);
325*a587d139SMark   if (!flgx || !flgy) {
326*a587d139SMark     ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
327*a587d139SMark     PetscFunctionReturn(0);
328*a587d139SMark   }
329*a587d139SMark   if (str == DIFFERENT_NONZERO_PATTERN) {
330*a587d139SMark     if (x->nz == y->nz) {
331*a587d139SMark       PetscBool e;
332*a587d139SMark       ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
333*a587d139SMark       if (e) {
334*a587d139SMark         ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
335*a587d139SMark         if (e) {
336*a587d139SMark           str = SAME_NONZERO_PATTERN;
337*a587d139SMark         }
338*a587d139SMark       }
339*a587d139SMark     }
340*a587d139SMark   }
341*a587d139SMark   if (str != SAME_NONZERO_PATTERN) {
342*a587d139SMark     ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
343*a587d139SMark     PetscFunctionReturn(0);
344*a587d139SMark   } else {
345*a587d139SMark     if (Y->offloadmask==PETSC_OFFLOAD_CPU && X->offloadmask==PETSC_OFFLOAD_CPU) {
346*a587d139SMark       ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
347*a587d139SMark       PetscFunctionReturn(0);
348*a587d139SMark     } else if ((Y->offloadmask==PETSC_OFFLOAD_GPU || Y->offloadmask==PETSC_OFFLOAD_BOTH) && X->offloadmask == PETSC_OFFLOAD_CPU) {
349*a587d139SMark       ierr    = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr);
350*a587d139SMark     } else if ((X->offloadmask==PETSC_OFFLOAD_GPU || X->offloadmask==PETSC_OFFLOAD_BOTH) && Y->offloadmask == PETSC_OFFLOAD_CPU) {
351*a587d139SMark       ierr    = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); /* promote Y */
352*a587d139SMark     }
353*a587d139SMark     {
354*a587d139SMark       Mat_SeqAIJKokkos *aijkokY = static_cast<Mat_SeqAIJKokkos*>(Y->spptr);
355*a587d139SMark       Mat_SeqAIJKokkos *aijkokX = static_cast<Mat_SeqAIJKokkos*>(X->spptr);
356*a587d139SMark       if (aijkokY && aijkokX && aijkokY->a_d.data() && aijkokX->a_d.data()) {
357*a587d139SMark         Kokkos::parallel_for (aijkokY->a_d.size(), KOKKOS_LAMBDA (int i) { aijkokY->a_d(i) += a*aijkokX->a_d(i); });
358*a587d139SMark         Kokkos::deep_copy(aijkokY->a_h,aijkokY->a_d); // we could not copy and keep GPU
359*a587d139SMark         Y->offloadmask = PETSC_OFFLOAD_BOTH;
360*a587d139SMark         ierr = PetscLogGpuFlops(2.0*aijkokY->a_d.size());CHKERRQ(ierr);
361*a587d139SMark       } else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"no Mat_SeqAIJKokkos ???");
362*a587d139SMark     }
363*a587d139SMark   }
364*a587d139SMark   PetscFunctionReturn(0);
365*a587d139SMark }
366*a587d139SMark 
3678c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
3688c3ff71bSJunchao Zhang {
3698c3ff71bSJunchao Zhang   PetscFunctionBegin;
370*a587d139SMark   A->ops->setvalues                 = MatSetValues_SeqAIJKokkos; /* protect with DEBUG, but MatSeqAIJSetTotalPreallocation defeats this ??? */
3718c3ff71bSJunchao Zhang   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
3728c3ff71bSJunchao Zhang   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
3738c3ff71bSJunchao Zhang   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
3748c3ff71bSJunchao Zhang 
375*a587d139SMark   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
376*a587d139SMark   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
3778c3ff71bSJunchao Zhang   A->ops->mult                      = MatMult_SeqAIJKokkos;
3788c3ff71bSJunchao Zhang   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
3798c3ff71bSJunchao Zhang   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
3808c3ff71bSJunchao Zhang   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
3818c3ff71bSJunchao Zhang   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
3828c3ff71bSJunchao Zhang   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
3838c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
3848c3ff71bSJunchao Zhang }
3858c3ff71bSJunchao Zhang 
3868c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/
3878c3ff71bSJunchao Zhang /*C
3888c3ff71bSJunchao Zhang    MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
3898c3ff71bSJunchao Zhang    (the default parallel PETSc format). This matrix will ultimately be handled by
3908c3ff71bSJunchao Zhang    Kokkos for calculations. For good matrix
3918c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
3928c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
3938c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
3948c3ff71bSJunchao Zhang 
3958c3ff71bSJunchao Zhang    Collective
3968c3ff71bSJunchao Zhang 
3978c3ff71bSJunchao Zhang    Input Parameters:
3988c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
3998c3ff71bSJunchao Zhang .  m - number of rows
4008c3ff71bSJunchao Zhang .  n - number of columns
4018c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
4028c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
4038c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
4048c3ff71bSJunchao Zhang 
4058c3ff71bSJunchao Zhang    Output Parameter:
4068c3ff71bSJunchao Zhang .  A - the matrix
4078c3ff71bSJunchao Zhang 
4088c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
4098c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradgm instead of this routine directly.
4108c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
4118c3ff71bSJunchao Zhang 
4128c3ff71bSJunchao Zhang    Notes:
4138c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
4148c3ff71bSJunchao Zhang 
4158c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
4168c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
4178c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
4188c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
4198c3ff71bSJunchao Zhang 
4208c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
4218c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
4228c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
4238c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
4248c3ff71bSJunchao Zhang 
4258c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
4268c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
4278c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
4288c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
4298c3ff71bSJunchao Zhang 
4308c3ff71bSJunchao Zhang    Level: intermediate
4318c3ff71bSJunchao Zhang 
4328c3ff71bSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSeqAIJKokkos, MATAIJKOKKOS
4338c3ff71bSJunchao Zhang @*/
4348c3ff71bSJunchao Zhang PetscErrorCode  MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
4358c3ff71bSJunchao Zhang {
4368c3ff71bSJunchao Zhang   PetscErrorCode ierr;
4378c3ff71bSJunchao Zhang 
4388c3ff71bSJunchao Zhang   PetscFunctionBegin;
4398c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
4408c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
4418c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
4428c3ff71bSJunchao Zhang   ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
4438c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
4448c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
4458c3ff71bSJunchao Zhang }
446