xref: /petsc/src/mat/impls/aij/mpi/kokkos/mpiaijkok.kokkos.cxx (revision 11d22bbf946b46f9abb4f64d9e62ca8b9837c611)
1*11d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
28c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/mpi/mpiaij.h>
3a587d139SMark #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
4*11d22bbfSJunchao Zhang 
58c3ff71bSJunchao Zhang 
68c3ff71bSJunchao Zhang PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode)
78c3ff71bSJunchao Zhang {
88c3ff71bSJunchao Zhang   PetscErrorCode   ierr;
98c3ff71bSJunchao Zhang   Mat_MPIAIJ       *mpiaij = (Mat_MPIAIJ*)A->data;
10a587d139SMark   Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL;
118c3ff71bSJunchao Zhang 
128c3ff71bSJunchao Zhang   PetscFunctionBegin;
138c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
148c3ff71bSJunchao Zhang   if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
158c3ff71bSJunchao Zhang     ierr = VecSetType(mpiaij->lvec,VECSEQKOKKOS);CHKERRQ(ierr);
168c3ff71bSJunchao Zhang   }
17a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
18a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
19a587d139SMark   }
20a587d139SMark 
218c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
228c3ff71bSJunchao Zhang }
238c3ff71bSJunchao Zhang 
248c3ff71bSJunchao Zhang PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
258c3ff71bSJunchao Zhang {
268c3ff71bSJunchao Zhang   PetscErrorCode ierr;
278c3ff71bSJunchao Zhang   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
288c3ff71bSJunchao Zhang 
298c3ff71bSJunchao Zhang   PetscFunctionBegin;
308c3ff71bSJunchao Zhang   ierr = PetscLayoutSetUp(mat->rmap);CHKERRQ(ierr);
318c3ff71bSJunchao Zhang   ierr = PetscLayoutSetUp(mat->cmap);CHKERRQ(ierr);
326a29ce69SStefano Zampini #if defined(PETSC_USE_DEBUG)
338c3ff71bSJunchao Zhang   if (d_nnz) {
346a29ce69SStefano Zampini     PetscInt i;
358c3ff71bSJunchao Zhang     for (i=0; i<mat->rmap->n; i++) {
368c3ff71bSJunchao Zhang       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
378c3ff71bSJunchao Zhang     }
388c3ff71bSJunchao Zhang   }
398c3ff71bSJunchao Zhang   if (o_nnz) {
406a29ce69SStefano Zampini     PetscInt i;
418c3ff71bSJunchao Zhang     for (i=0; i<mat->rmap->n; i++) {
428c3ff71bSJunchao Zhang       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
438c3ff71bSJunchao Zhang     }
448c3ff71bSJunchao Zhang   }
456a29ce69SStefano Zampini #endif
466a29ce69SStefano Zampini #if defined(PETSC_USE_CTABLE)
476a29ce69SStefano Zampini   ierr = PetscTableDestroy(&mpiaij->colmap);CHKERRQ(ierr);
486a29ce69SStefano Zampini #else
496a29ce69SStefano Zampini   ierr = PetscFree(mpiaij->colmap);CHKERRQ(ierr);
506a29ce69SStefano Zampini #endif
516a29ce69SStefano Zampini   ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr);
526a29ce69SStefano Zampini   ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr);
536a29ce69SStefano Zampini   ierr = VecScatterDestroy(&mpiaij->Mvctx);CHKERRQ(ierr);
546a29ce69SStefano Zampini   /* Because the B will have been resized we simply destroy it and create a new one each time */
556a29ce69SStefano Zampini   ierr = MatDestroy(&mpiaij->B);CHKERRQ(ierr);
566a29ce69SStefano Zampini 
576a29ce69SStefano Zampini   if (!mpiaij->A) {
588c3ff71bSJunchao Zhang     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
598c3ff71bSJunchao Zhang     ierr = MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);CHKERRQ(ierr);
608c3ff71bSJunchao Zhang     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);CHKERRQ(ierr);
616a29ce69SStefano Zampini   }
626a29ce69SStefano Zampini   if (!mpiaij->B) {
636a29ce69SStefano Zampini     PetscMPIInt size;
6455b25c41SPierre Jolivet     ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);CHKERRMPI(ierr);
658c3ff71bSJunchao Zhang     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
666a29ce69SStefano Zampini     ierr = MatSetSizes(mpiaij->B,mat->rmap->n,size > 1 ? mat->cmap->N : 0,mat->rmap->n,size > 1 ? mat->cmap->N : 0);CHKERRQ(ierr);
678c3ff71bSJunchao Zhang     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);CHKERRQ(ierr);
688c3ff71bSJunchao Zhang   }
696a29ce69SStefano Zampini   ierr = MatSetType(mpiaij->A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
706a29ce69SStefano Zampini   ierr = MatSetType(mpiaij->B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
718c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
728c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
738c3ff71bSJunchao Zhang   mat->preallocated = PETSC_TRUE;
748c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
758c3ff71bSJunchao Zhang }
768c3ff71bSJunchao Zhang 
778c3ff71bSJunchao Zhang PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
788c3ff71bSJunchao Zhang {
798c3ff71bSJunchao Zhang   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
808c3ff71bSJunchao Zhang   PetscErrorCode ierr;
818c3ff71bSJunchao Zhang   PetscInt       nt;
828c3ff71bSJunchao Zhang 
838c3ff71bSJunchao Zhang   PetscFunctionBegin;
848c3ff71bSJunchao Zhang   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
858c3ff71bSJunchao Zhang   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt);
868c3ff71bSJunchao Zhang   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
878c3ff71bSJunchao Zhang   ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
888c3ff71bSJunchao Zhang   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
898c3ff71bSJunchao Zhang   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
908c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
918c3ff71bSJunchao Zhang }
928c3ff71bSJunchao Zhang 
938c3ff71bSJunchao Zhang PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz)
948c3ff71bSJunchao Zhang {
958c3ff71bSJunchao Zhang   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
968c3ff71bSJunchao Zhang   PetscErrorCode ierr;
978c3ff71bSJunchao Zhang   PetscInt       nt;
988c3ff71bSJunchao Zhang 
998c3ff71bSJunchao Zhang   PetscFunctionBegin;
1008c3ff71bSJunchao Zhang   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
1018c3ff71bSJunchao Zhang   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt);
1028c3ff71bSJunchao Zhang   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1038c3ff71bSJunchao Zhang   ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
1048c3ff71bSJunchao Zhang   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1058c3ff71bSJunchao Zhang   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
1068c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1078c3ff71bSJunchao Zhang }
1088c3ff71bSJunchao Zhang 
1098c3ff71bSJunchao Zhang PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
1108c3ff71bSJunchao Zhang {
1118c3ff71bSJunchao Zhang   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
1128c3ff71bSJunchao Zhang   PetscErrorCode ierr;
1138c3ff71bSJunchao Zhang   PetscInt       nt;
1148c3ff71bSJunchao Zhang 
1158c3ff71bSJunchao Zhang   PetscFunctionBegin;
1168c3ff71bSJunchao Zhang   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
1178c3ff71bSJunchao Zhang   if (nt != mat->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->rmap->n,nt);
1188c3ff71bSJunchao Zhang   ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
1198c3ff71bSJunchao Zhang   ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
1208c3ff71bSJunchao Zhang   ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
1218c3ff71bSJunchao Zhang   ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
1228c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1238c3ff71bSJunchao Zhang }
1248c3ff71bSJunchao Zhang 
1258c3ff71bSJunchao Zhang PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
1268c3ff71bSJunchao Zhang {
1278c3ff71bSJunchao Zhang   PetscErrorCode     ierr;
1288c3ff71bSJunchao Zhang   Mat                B;
1298c3ff71bSJunchao Zhang 
1308c3ff71bSJunchao Zhang   PetscFunctionBegin;
1318c3ff71bSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
1328c3ff71bSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
1338c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
1348c3ff71bSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
1358c3ff71bSJunchao Zhang   }
1368c3ff71bSJunchao Zhang   B = *newmat;
1378c3ff71bSJunchao Zhang 
1386f3d89d0SStefano Zampini   B->boundtocpu = PETSC_FALSE;
1398c3ff71bSJunchao Zhang   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
1408c3ff71bSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
1413d0639e7SStefano Zampini   ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);CHKERRQ(ierr);
1428c3ff71bSJunchao Zhang 
1438c3ff71bSJunchao Zhang   B->ops->assemblyend           = MatAssemblyEnd_MPIAIJKokkos;
1448c3ff71bSJunchao Zhang   B->ops->mult                  = MatMult_MPIAIJKokkos;
1458c3ff71bSJunchao Zhang   B->ops->multadd               = MatMultAdd_MPIAIJKokkos;
1468c3ff71bSJunchao Zhang   B->ops->multtranspose         = MatMultTranspose_MPIAIJKokkos;
1474e84afc0SStefano Zampini   // Needs an efficient implementation of the COO preallocation routines
1484e84afc0SStefano Zampini   //B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
1498c3ff71bSJunchao Zhang 
1503d0639e7SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);CHKERRQ(ierr);
1518c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1528c3ff71bSJunchao Zhang }
1538c3ff71bSJunchao Zhang 
1548c3ff71bSJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A)
1558c3ff71bSJunchao Zhang {
1568c3ff71bSJunchao Zhang   PetscErrorCode ierr;
1578c3ff71bSJunchao Zhang 
1588c3ff71bSJunchao Zhang   PetscFunctionBegin;
1598c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1608c3ff71bSJunchao Zhang   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
1618c3ff71bSJunchao Zhang   ierr = MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
1628c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1638c3ff71bSJunchao Zhang }
1648c3ff71bSJunchao Zhang 
1658c3ff71bSJunchao Zhang /*@C
1668c3ff71bSJunchao Zhang    MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1678c3ff71bSJunchao Zhang    (the default parallel PETSc format).  This matrix will ultimately pushed down
1688c3ff71bSJunchao Zhang    to Kokkos for calculations. For good matrix
1698c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
1708c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
1718c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
1728c3ff71bSJunchao Zhang 
1738c3ff71bSJunchao Zhang    Collective
1748c3ff71bSJunchao Zhang 
1758c3ff71bSJunchao Zhang    Input Parameters:
1768c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
1778c3ff71bSJunchao Zhang .  m - number of rows
1788c3ff71bSJunchao Zhang .  n - number of columns
1798c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
1808c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
1818c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
1828c3ff71bSJunchao Zhang 
1838c3ff71bSJunchao Zhang    Output Parameter:
1848c3ff71bSJunchao Zhang .  A - the matrix
1858c3ff71bSJunchao Zhang 
1868c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1878c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1888c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1898c3ff71bSJunchao Zhang 
1908c3ff71bSJunchao Zhang    Notes:
1918c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
1928c3ff71bSJunchao Zhang 
1938c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
1948c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
1958c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
1968c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
1978c3ff71bSJunchao Zhang 
1988c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
1998c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2008c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
2018c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
2028c3ff71bSJunchao Zhang 
2038c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
2048c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
2058c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
2068c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
2078c3ff71bSJunchao Zhang 
2088c3ff71bSJunchao Zhang    Level: intermediate
2098c3ff71bSJunchao Zhang 
2108c3ff71bSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos
2118c3ff71bSJunchao Zhang @*/
2128c3ff71bSJunchao Zhang PetscErrorCode  MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
2138c3ff71bSJunchao Zhang {
2148c3ff71bSJunchao Zhang   PetscErrorCode ierr;
2158c3ff71bSJunchao Zhang   PetscMPIInt    size;
2168c3ff71bSJunchao Zhang 
2178c3ff71bSJunchao Zhang   PetscFunctionBegin;
2188c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
2198c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
220ffc4695bSBarry Smith   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
2218c3ff71bSJunchao Zhang   if (size > 1) {
2228c3ff71bSJunchao Zhang     ierr = MatSetType(*A,MATMPIAIJKOKKOS);CHKERRQ(ierr);
2238c3ff71bSJunchao Zhang     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
2248c3ff71bSJunchao Zhang   } else {
2258c3ff71bSJunchao Zhang     ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
2268c3ff71bSJunchao Zhang     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
2278c3ff71bSJunchao Zhang   }
2288c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2298c3ff71bSJunchao Zhang }
2308c3ff71bSJunchao Zhang 
231a587d139SMark // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
232a587d139SMark PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
233a587d139SMark {
234a587d139SMark   #if defined(PETSC_USE_CTABLE)
235a587d139SMark   SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
236a587d139SMark   #else
237a587d139SMark   PetscMPIInt                size,rank;
238a587d139SMark   MPI_Comm                   comm;
239a587d139SMark   PetscErrorCode             ierr;
240a587d139SMark   PetscSplitCSRDataStructure *d_mat=NULL;
241a587d139SMark 
242a587d139SMark   PetscFunctionBegin;
243a587d139SMark   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
24455b25c41SPierre Jolivet   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
24555b25c41SPierre Jolivet   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
246a587d139SMark   if (size == 1) {
247a587d139SMark     ierr   = MatSeqAIJKokkosGetDeviceMat(A,&d_mat);CHKERRQ(ierr);
248a587d139SMark   } else {
249a587d139SMark     Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
250a587d139SMark     ierr   = MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);CHKERRQ(ierr);
251a587d139SMark   }
252a587d139SMark   // act like MatSetValues because not called on host
253a587d139SMark   if (A->assembled) {
254a587d139SMark     if (A->was_assembled) {
255a587d139SMark       ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
256a587d139SMark     }
257a587d139SMark     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
258a587d139SMark   } else {
259a587d139SMark     ierr = PetscInfo1(A,"Warning !assemble ??? assembled=%D\n",A->assembled);CHKERRQ(ierr);
260a587d139SMark     // SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
261a587d139SMark   }
262a587d139SMark   if (!d_mat) {
263a587d139SMark     PetscSplitCSRDataStructure  h_mat; /* host container */
264a587d139SMark     Mat_SeqAIJKokkos            *aijkokA;
265a587d139SMark     Mat_SeqAIJ                  *jaca;
266a587d139SMark     PetscInt                    n = A->rmap->n, nnz;
267a587d139SMark     Mat                         Amat;
268a587d139SMark     // create and copy
269a587d139SMark     ierr = PetscInfo(A,"Create device matrix in Kokkos\n");CHKERRQ(ierr);
270a587d139SMark     if (size == 1) {
271a587d139SMark       Amat = A;
272a587d139SMark       jaca = (Mat_SeqAIJ*)A->data;
273a587d139SMark       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
274a587d139SMark       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
275a587d139SMark       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
276a587d139SMark       h_mat.offdiag.a = NULL;
277a587d139SMark       aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
278a587d139SMark       aijkokA->i_uncompressed_d = NULL;
279a587d139SMark       aijkokA->colmap_d = NULL;
280a587d139SMark     } else {
281a587d139SMark       Mat_MPIAIJ       *aij = (Mat_MPIAIJ*)A->data;
282a587d139SMark       Mat_SeqAIJ       *jacb = (Mat_SeqAIJ*)aij->B->data;
283a587d139SMark       PetscInt         ii;
284a587d139SMark       Mat_SeqAIJKokkos *aijkokB;
285a587d139SMark       Amat = aij->A;
286a587d139SMark       aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr);
287a587d139SMark       aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr);
288a587d139SMark       aijkokA->i_uncompressed_d = NULL;
289a587d139SMark       aijkokA->colmap_d = NULL;
290a587d139SMark       jaca = (Mat_SeqAIJ*)aij->A->data;
291a587d139SMark       if (!aij->garray) SETERRQ(comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
292a587d139SMark       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
293a587d139SMark       // create colmap - this is ussually done (lazy) in MatSetValues
294a587d139SMark       aij->donotstash = PETSC_TRUE;
295a587d139SMark       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
296a587d139SMark       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly
297a587d139SMark       ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
298a587d139SMark       aij->colmap[A->cmap->N] = -9;
299a587d139SMark       ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
300a587d139SMark       for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
301a587d139SMark       if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
302a587d139SMark       // allocate B copy data
303a587d139SMark       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
304a587d139SMark       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
305a587d139SMark       nnz = jacb->i[n];
306a587d139SMark       if (jacb->compressedrow.use) {
307a587d139SMark         const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1);
308a587d139SMark         aijkokB->i_uncompressed_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_i_k));
309a587d139SMark         Kokkos::deep_copy (*aijkokB->i_uncompressed_d, h_i_k);
310a587d139SMark         h_mat.offdiag.i = aijkokB->i_uncompressed_d->data();
311a587d139SMark         //err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
312a587d139SMark         //err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
313a587d139SMark       } else {
314a587d139SMark          h_mat.offdiag.i = (PetscInt*)aijkokB->i_d.data();
315a587d139SMark       }
316a587d139SMark       h_mat.offdiag.j = (PetscInt*)aijkokB->j_d.data();
317a587d139SMark       h_mat.offdiag.a = aijkokB->a_d.data();
318a587d139SMark       {
319a587d139SMark         Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (aij->colmap,A->cmap->N);
320a587d139SMark         aijkokB->colmap_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_colmap_k));
321a587d139SMark         Kokkos::deep_copy (*aijkokB->colmap_d, h_colmap_k);
322a587d139SMark         h_mat.colmap = aijkokB->colmap_d->data();
323a587d139SMark       }
324a587d139SMark       //err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
325a587d139SMark       //err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
326a587d139SMark       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
327a587d139SMark       h_mat.offdiag.n = n;
328a587d139SMark     }
329a587d139SMark     // allocate A copy data
330a587d139SMark     nnz = jaca->i[n];
331a587d139SMark     h_mat.diag.n = n;
332a587d139SMark     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
33355b25c41SPierre Jolivet     ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
334a587d139SMark     if (jaca->compressedrow.use) {
335a587d139SMark       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)");
336a587d139SMark     } else {
337a587d139SMark       h_mat.diag.i = (PetscInt*)aijkokA->i_d.data();
338a587d139SMark     }
339a587d139SMark     //h_mat.diag.j = aj;
340a587d139SMark     //h_mat.diag.a = aa;
341a587d139SMark     h_mat.diag.j = (PetscInt*)aijkokA->j_d.data();
342a587d139SMark     h_mat.diag.a = aijkokA->a_d.data();
343a587d139SMark     // copy pointers and metdata to device
344a587d139SMark     ierr = MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);CHKERRQ(ierr);
345a587d139SMark     ierr = MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);CHKERRQ(ierr);
346a587d139SMark     ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
347a587d139SMark   }
348a587d139SMark   *B = d_mat; // return it, set it in Mat, and set it up
349a587d139SMark   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
350a587d139SMark   PetscFunctionReturn(0);
351a587d139SMark   #endif
352a587d139SMark }
353