xref: /petsc/src/mat/impls/aij/mpi/kokkos/mpiaijkok.kokkos.cxx (revision a587d1398bd356a7db20a09283619c724e82d622)
18c3ff71bSJunchao Zhang #include <petscconf.h>
28c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/mpi/mpiaij.h>
3*a587d139SMark #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
4*a587d139SMark #include <petscveckokkos.hpp>
58c3ff71bSJunchao Zhang 
68c3ff71bSJunchao Zhang PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode)
78c3ff71bSJunchao Zhang {
88c3ff71bSJunchao Zhang   PetscErrorCode   ierr;
98c3ff71bSJunchao Zhang   Mat_MPIAIJ       *mpiaij = (Mat_MPIAIJ*)A->data;
10*a587d139SMark   Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL;
118c3ff71bSJunchao Zhang 
128c3ff71bSJunchao Zhang   PetscFunctionBegin;
138c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
148c3ff71bSJunchao Zhang   if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
158c3ff71bSJunchao Zhang     ierr = VecSetType(mpiaij->lvec,VECSEQKOKKOS);CHKERRQ(ierr);
168c3ff71bSJunchao Zhang   }
17*a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
18*a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
19*a587d139SMark   }
20*a587d139SMark 
218c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
228c3ff71bSJunchao Zhang }
238c3ff71bSJunchao Zhang 
248c3ff71bSJunchao Zhang PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
258c3ff71bSJunchao Zhang {
268c3ff71bSJunchao Zhang   PetscErrorCode     ierr;
278c3ff71bSJunchao Zhang   PetscInt           i;
288c3ff71bSJunchao Zhang   Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)mat->data;
298c3ff71bSJunchao Zhang 
308c3ff71bSJunchao Zhang   PetscFunctionBegin;
318c3ff71bSJunchao Zhang   ierr = PetscLayoutSetUp(mat->rmap);CHKERRQ(ierr);
328c3ff71bSJunchao Zhang   ierr = PetscLayoutSetUp(mat->cmap);CHKERRQ(ierr);
338c3ff71bSJunchao Zhang   if (d_nnz) {
348c3ff71bSJunchao Zhang     for (i=0; i<mat->rmap->n; i++) {
358c3ff71bSJunchao Zhang       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
368c3ff71bSJunchao Zhang     }
378c3ff71bSJunchao Zhang   }
388c3ff71bSJunchao Zhang   if (o_nnz) {
398c3ff71bSJunchao Zhang     for (i=0; i<mat->rmap->n; i++) {
408c3ff71bSJunchao Zhang       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
418c3ff71bSJunchao Zhang     }
428c3ff71bSJunchao Zhang   }
438c3ff71bSJunchao Zhang   if (!mat->preallocated) {
448c3ff71bSJunchao Zhang     /* Explicitly create 2 MATSEQAIJKOKKOS matrices. */
458c3ff71bSJunchao Zhang     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
468c3ff71bSJunchao Zhang     ierr = MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);CHKERRQ(ierr);
478c3ff71bSJunchao Zhang     ierr = MatSetType(mpiaij->A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
488c3ff71bSJunchao Zhang     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);CHKERRQ(ierr);
498c3ff71bSJunchao Zhang     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
508c3ff71bSJunchao Zhang     ierr = MatSetSizes(mpiaij->B,mat->rmap->n,mat->cmap->N,mat->rmap->n,mat->cmap->N);CHKERRQ(ierr);
518c3ff71bSJunchao Zhang     ierr = MatSetType(mpiaij->B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
528c3ff71bSJunchao Zhang     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);CHKERRQ(ierr);
538c3ff71bSJunchao Zhang   }
548c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
558c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
568c3ff71bSJunchao Zhang 
578c3ff71bSJunchao Zhang   mat->preallocated = PETSC_TRUE;
588c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
598c3ff71bSJunchao Zhang }
608c3ff71bSJunchao Zhang 
618c3ff71bSJunchao Zhang PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
628c3ff71bSJunchao Zhang {
638c3ff71bSJunchao Zhang   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
648c3ff71bSJunchao Zhang   PetscErrorCode ierr;
658c3ff71bSJunchao Zhang   PetscInt       nt;
668c3ff71bSJunchao Zhang 
678c3ff71bSJunchao Zhang   PetscFunctionBegin;
688c3ff71bSJunchao Zhang   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
698c3ff71bSJunchao Zhang   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt);
708c3ff71bSJunchao Zhang   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
718c3ff71bSJunchao Zhang   ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
728c3ff71bSJunchao Zhang   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
738c3ff71bSJunchao Zhang   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
748c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
758c3ff71bSJunchao Zhang }
768c3ff71bSJunchao Zhang 
778c3ff71bSJunchao Zhang PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz)
788c3ff71bSJunchao Zhang {
798c3ff71bSJunchao Zhang   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
808c3ff71bSJunchao Zhang   PetscErrorCode ierr;
818c3ff71bSJunchao Zhang   PetscInt       nt;
828c3ff71bSJunchao Zhang 
838c3ff71bSJunchao Zhang   PetscFunctionBegin;
848c3ff71bSJunchao Zhang   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
858c3ff71bSJunchao Zhang   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt);
868c3ff71bSJunchao Zhang   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
878c3ff71bSJunchao Zhang   ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
888c3ff71bSJunchao Zhang   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
898c3ff71bSJunchao Zhang   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
908c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
918c3ff71bSJunchao Zhang }
928c3ff71bSJunchao Zhang 
938c3ff71bSJunchao Zhang PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
948c3ff71bSJunchao Zhang {
958c3ff71bSJunchao Zhang   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
968c3ff71bSJunchao Zhang   PetscErrorCode ierr;
978c3ff71bSJunchao Zhang   PetscInt       nt;
988c3ff71bSJunchao Zhang 
998c3ff71bSJunchao Zhang   PetscFunctionBegin;
1008c3ff71bSJunchao Zhang   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
1018c3ff71bSJunchao Zhang   if (nt != mat->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->rmap->n,nt);
1028c3ff71bSJunchao Zhang   ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
1038c3ff71bSJunchao Zhang   ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
1048c3ff71bSJunchao Zhang   ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
1058c3ff71bSJunchao Zhang   ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
1068c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1078c3ff71bSJunchao Zhang }
1088c3ff71bSJunchao Zhang 
1098c3ff71bSJunchao Zhang PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
1108c3ff71bSJunchao Zhang {
1118c3ff71bSJunchao Zhang   PetscErrorCode     ierr;
1128c3ff71bSJunchao Zhang   Mat                B;
1138c3ff71bSJunchao Zhang 
1148c3ff71bSJunchao Zhang   PetscFunctionBegin;
1158c3ff71bSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
1168c3ff71bSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
1178c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
1188c3ff71bSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
1198c3ff71bSJunchao Zhang   }
1208c3ff71bSJunchao Zhang   B = *newmat;
1218c3ff71bSJunchao Zhang 
1228c3ff71bSJunchao Zhang   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
1238c3ff71bSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
1243d0639e7SStefano Zampini   ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);CHKERRQ(ierr);
1258c3ff71bSJunchao Zhang 
1268c3ff71bSJunchao Zhang   B->ops->assemblyend    = MatAssemblyEnd_MPIAIJKokkos;
1278c3ff71bSJunchao Zhang   B->ops->mult           = MatMult_MPIAIJKokkos;
1288c3ff71bSJunchao Zhang   B->ops->multadd        = MatMultAdd_MPIAIJKokkos;
1298c3ff71bSJunchao Zhang   B->ops->multtranspose  = MatMultTranspose_MPIAIJKokkos;
1308c3ff71bSJunchao Zhang 
1313d0639e7SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);CHKERRQ(ierr);
1328c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1338c3ff71bSJunchao Zhang }
1348c3ff71bSJunchao Zhang 
1358c3ff71bSJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A)
1368c3ff71bSJunchao Zhang {
1378c3ff71bSJunchao Zhang   PetscErrorCode ierr;
1388c3ff71bSJunchao Zhang 
1398c3ff71bSJunchao Zhang   PetscFunctionBegin;
1408c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1418c3ff71bSJunchao Zhang   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
1428c3ff71bSJunchao Zhang   ierr = MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
1438c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1448c3ff71bSJunchao Zhang }
1458c3ff71bSJunchao Zhang 
1468c3ff71bSJunchao Zhang /*@C
1478c3ff71bSJunchao Zhang    MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1488c3ff71bSJunchao Zhang    (the default parallel PETSc format).  This matrix will ultimately pushed down
1498c3ff71bSJunchao Zhang    to Kokkos for calculations. For good matrix
1508c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
1518c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
1528c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
1538c3ff71bSJunchao Zhang 
1548c3ff71bSJunchao Zhang    Collective
1558c3ff71bSJunchao Zhang 
1568c3ff71bSJunchao Zhang    Input Parameters:
1578c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
1588c3ff71bSJunchao Zhang .  m - number of rows
1598c3ff71bSJunchao Zhang .  n - number of columns
1608c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
1618c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
1628c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
1638c3ff71bSJunchao Zhang 
1648c3ff71bSJunchao Zhang    Output Parameter:
1658c3ff71bSJunchao Zhang .  A - the matrix
1668c3ff71bSJunchao Zhang 
1678c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1688c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1698c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1708c3ff71bSJunchao Zhang 
1718c3ff71bSJunchao Zhang    Notes:
1728c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
1738c3ff71bSJunchao Zhang 
1748c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
1758c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
1768c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
1778c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
1788c3ff71bSJunchao Zhang 
1798c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
1808c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1818c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
1828c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
1838c3ff71bSJunchao Zhang 
1848c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
1858c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
1868c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
1878c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
1888c3ff71bSJunchao Zhang 
1898c3ff71bSJunchao Zhang    Level: intermediate
1908c3ff71bSJunchao Zhang 
1918c3ff71bSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos
1928c3ff71bSJunchao Zhang @*/
1938c3ff71bSJunchao Zhang PetscErrorCode  MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
1948c3ff71bSJunchao Zhang {
1958c3ff71bSJunchao Zhang   PetscErrorCode ierr;
1968c3ff71bSJunchao Zhang   PetscMPIInt    size;
1978c3ff71bSJunchao Zhang 
1988c3ff71bSJunchao Zhang   PetscFunctionBegin;
1998c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
2008c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
2018c3ff71bSJunchao Zhang   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
2028c3ff71bSJunchao Zhang   if (size > 1) {
2038c3ff71bSJunchao Zhang     ierr = MatSetType(*A,MATMPIAIJKOKKOS);CHKERRQ(ierr);
2048c3ff71bSJunchao Zhang     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
2058c3ff71bSJunchao Zhang   } else {
2068c3ff71bSJunchao Zhang     ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
2078c3ff71bSJunchao Zhang     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
2088c3ff71bSJunchao Zhang   }
2098c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2108c3ff71bSJunchao Zhang }
2118c3ff71bSJunchao Zhang 
212*a587d139SMark // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
213*a587d139SMark PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
214*a587d139SMark {
215*a587d139SMark   #if defined(PETSC_USE_CTABLE)
216*a587d139SMark   SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
217*a587d139SMark   #else
218*a587d139SMark   PetscMPIInt                size,rank;
219*a587d139SMark   MPI_Comm                   comm;
220*a587d139SMark   PetscErrorCode             ierr;
221*a587d139SMark   PetscSplitCSRDataStructure *d_mat=NULL;
222*a587d139SMark 
223*a587d139SMark   PetscFunctionBegin;
224*a587d139SMark   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
225*a587d139SMark   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
226*a587d139SMark   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
227*a587d139SMark   if (size == 1) {
228*a587d139SMark     ierr   = MatSeqAIJKokkosGetDeviceMat(A,&d_mat);CHKERRQ(ierr);
229*a587d139SMark   } else {
230*a587d139SMark     Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
231*a587d139SMark     ierr   = MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);CHKERRQ(ierr);
232*a587d139SMark   }
233*a587d139SMark   // act like MatSetValues because not called on host
234*a587d139SMark   if (A->assembled) {
235*a587d139SMark     if (A->was_assembled) {
236*a587d139SMark       ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
237*a587d139SMark     }
238*a587d139SMark     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
239*a587d139SMark   } else {
240*a587d139SMark     ierr = PetscInfo1(A,"Warning !assemble ??? assembled=%D\n",A->assembled);CHKERRQ(ierr);
241*a587d139SMark     // SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
242*a587d139SMark   }
243*a587d139SMark   if (!d_mat) {
244*a587d139SMark     PetscSplitCSRDataStructure  h_mat; /* host container */
245*a587d139SMark     Mat_SeqAIJKokkos            *aijkokA;
246*a587d139SMark     Mat_SeqAIJ                  *jaca;
247*a587d139SMark     PetscInt                    n = A->rmap->n, nnz;
248*a587d139SMark     Mat                         Amat;
249*a587d139SMark     // create and copy
250*a587d139SMark     ierr = PetscInfo(A,"Create device matrix in Kokkos\n");CHKERRQ(ierr);
251*a587d139SMark     if (size == 1) {
252*a587d139SMark       Amat = A;
253*a587d139SMark       jaca = (Mat_SeqAIJ*)A->data;
254*a587d139SMark       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
255*a587d139SMark       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
256*a587d139SMark       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
257*a587d139SMark       h_mat.offdiag.a = NULL;
258*a587d139SMark       aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
259*a587d139SMark       aijkokA->i_uncompressed_d = NULL;
260*a587d139SMark       aijkokA->colmap_d = NULL;
261*a587d139SMark     } else {
262*a587d139SMark       Mat_MPIAIJ       *aij = (Mat_MPIAIJ*)A->data;
263*a587d139SMark       Mat_SeqAIJ       *jacb = (Mat_SeqAIJ*)aij->B->data;
264*a587d139SMark       PetscInt         ii;
265*a587d139SMark       Mat_SeqAIJKokkos *aijkokB;
266*a587d139SMark       Amat = aij->A;
267*a587d139SMark       aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr);
268*a587d139SMark       aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr);
269*a587d139SMark       aijkokA->i_uncompressed_d = NULL;
270*a587d139SMark       aijkokA->colmap_d = NULL;
271*a587d139SMark       jaca = (Mat_SeqAIJ*)aij->A->data;
272*a587d139SMark       if (!aij->garray) SETERRQ(comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
273*a587d139SMark       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
274*a587d139SMark       // create colmap - this is ussually done (lazy) in MatSetValues
275*a587d139SMark       aij->donotstash = PETSC_TRUE;
276*a587d139SMark       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
277*a587d139SMark       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly
278*a587d139SMark       ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
279*a587d139SMark       aij->colmap[A->cmap->N] = -9;
280*a587d139SMark       ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
281*a587d139SMark       for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
282*a587d139SMark       if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
283*a587d139SMark       // allocate B copy data
284*a587d139SMark       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
285*a587d139SMark       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
286*a587d139SMark       nnz = jacb->i[n];
287*a587d139SMark       if (jacb->compressedrow.use) {
288*a587d139SMark         const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1);
289*a587d139SMark         aijkokB->i_uncompressed_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_i_k));
290*a587d139SMark         Kokkos::deep_copy (*aijkokB->i_uncompressed_d, h_i_k);
291*a587d139SMark         h_mat.offdiag.i = aijkokB->i_uncompressed_d->data();
292*a587d139SMark         //err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
293*a587d139SMark         //err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
294*a587d139SMark       } else {
295*a587d139SMark          h_mat.offdiag.i = (PetscInt*)aijkokB->i_d.data();
296*a587d139SMark       }
297*a587d139SMark       h_mat.offdiag.j = (PetscInt*)aijkokB->j_d.data();
298*a587d139SMark       h_mat.offdiag.a = aijkokB->a_d.data();
299*a587d139SMark       {
300*a587d139SMark         Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (aij->colmap,A->cmap->N);
301*a587d139SMark         aijkokB->colmap_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_colmap_k));
302*a587d139SMark         Kokkos::deep_copy (*aijkokB->colmap_d, h_colmap_k);
303*a587d139SMark         h_mat.colmap = aijkokB->colmap_d->data();
304*a587d139SMark       }
305*a587d139SMark       //err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
306*a587d139SMark       //err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
307*a587d139SMark       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
308*a587d139SMark       h_mat.offdiag.n = n;
309*a587d139SMark     }
310*a587d139SMark     // allocate A copy data
311*a587d139SMark     nnz = jaca->i[n];
312*a587d139SMark     h_mat.diag.n = n;
313*a587d139SMark     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
314*a587d139SMark     ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
315*a587d139SMark     if (jaca->compressedrow.use) {
316*a587d139SMark       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)");
317*a587d139SMark     } else {
318*a587d139SMark       h_mat.diag.i = (PetscInt*)aijkokA->i_d.data();
319*a587d139SMark     }
320*a587d139SMark     //h_mat.diag.j = aj;
321*a587d139SMark     //h_mat.diag.a = aa;
322*a587d139SMark     h_mat.diag.j = (PetscInt*)aijkokA->j_d.data();
323*a587d139SMark     h_mat.diag.a = aijkokA->a_d.data();
324*a587d139SMark     // copy pointers and metdata to device
325*a587d139SMark     ierr = MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);CHKERRQ(ierr);
326*a587d139SMark     ierr = MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);CHKERRQ(ierr);
327*a587d139SMark     ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
328*a587d139SMark   }
329*a587d139SMark   *B = d_mat; // return it, set it in Mat, and set it up
330*a587d139SMark   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
331*a587d139SMark   PetscFunctionReturn(0);
332*a587d139SMark   #endif
333*a587d139SMark }
334