xref: /petsc/src/mat/impls/aij/mpi/kokkos/mpiaijkok.kokkos.cxx (revision a5b23f4acc7afc99d3844ebd5fb65a81c16e8b8c)
111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp>
28c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/mpi/mpiaij.h>
3a587d139SMark #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp>
411d22bbfSJunchao Zhang 
58c3ff71bSJunchao Zhang PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode)
68c3ff71bSJunchao Zhang {
78c3ff71bSJunchao Zhang   PetscErrorCode   ierr;
88c3ff71bSJunchao Zhang   Mat_MPIAIJ       *mpiaij = (Mat_MPIAIJ*)A->data;
9a587d139SMark   Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL;
108c3ff71bSJunchao Zhang 
118c3ff71bSJunchao Zhang   PetscFunctionBegin;
128c3ff71bSJunchao Zhang   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
138c3ff71bSJunchao Zhang   if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
148c3ff71bSJunchao Zhang     ierr = VecSetType(mpiaij->lvec,VECSEQKOKKOS);CHKERRQ(ierr);
158c3ff71bSJunchao Zhang   }
16a587d139SMark   if (aijkok && aijkok->device_mat_d.data()) {
17a587d139SMark     A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this
18a587d139SMark   }
19a587d139SMark 
208c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
218c3ff71bSJunchao Zhang }
228c3ff71bSJunchao Zhang 
238c3ff71bSJunchao Zhang PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
248c3ff71bSJunchao Zhang {
258c3ff71bSJunchao Zhang   PetscErrorCode ierr;
268c3ff71bSJunchao Zhang   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
278c3ff71bSJunchao Zhang 
288c3ff71bSJunchao Zhang   PetscFunctionBegin;
298c3ff71bSJunchao Zhang   ierr = PetscLayoutSetUp(mat->rmap);CHKERRQ(ierr);
308c3ff71bSJunchao Zhang   ierr = PetscLayoutSetUp(mat->cmap);CHKERRQ(ierr);
316a29ce69SStefano Zampini #if defined(PETSC_USE_DEBUG)
328c3ff71bSJunchao Zhang   if (d_nnz) {
336a29ce69SStefano Zampini     PetscInt i;
348c3ff71bSJunchao Zhang     for (i=0; i<mat->rmap->n; i++) {
358c3ff71bSJunchao Zhang       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
368c3ff71bSJunchao Zhang     }
378c3ff71bSJunchao Zhang   }
388c3ff71bSJunchao Zhang   if (o_nnz) {
396a29ce69SStefano Zampini     PetscInt i;
408c3ff71bSJunchao Zhang     for (i=0; i<mat->rmap->n; i++) {
418c3ff71bSJunchao Zhang       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
428c3ff71bSJunchao Zhang     }
438c3ff71bSJunchao Zhang   }
446a29ce69SStefano Zampini #endif
456a29ce69SStefano Zampini #if defined(PETSC_USE_CTABLE)
466a29ce69SStefano Zampini   ierr = PetscTableDestroy(&mpiaij->colmap);CHKERRQ(ierr);
476a29ce69SStefano Zampini #else
486a29ce69SStefano Zampini   ierr = PetscFree(mpiaij->colmap);CHKERRQ(ierr);
496a29ce69SStefano Zampini #endif
506a29ce69SStefano Zampini   ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr);
516a29ce69SStefano Zampini   ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr);
526a29ce69SStefano Zampini   ierr = VecScatterDestroy(&mpiaij->Mvctx);CHKERRQ(ierr);
536a29ce69SStefano Zampini   /* Because the B will have been resized we simply destroy it and create a new one each time */
546a29ce69SStefano Zampini   ierr = MatDestroy(&mpiaij->B);CHKERRQ(ierr);
556a29ce69SStefano Zampini 
566a29ce69SStefano Zampini   if (!mpiaij->A) {
578c3ff71bSJunchao Zhang     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
588c3ff71bSJunchao Zhang     ierr = MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);CHKERRQ(ierr);
598c3ff71bSJunchao Zhang     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);CHKERRQ(ierr);
606a29ce69SStefano Zampini   }
616a29ce69SStefano Zampini   if (!mpiaij->B) {
626a29ce69SStefano Zampini     PetscMPIInt size;
6355b25c41SPierre Jolivet     ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);CHKERRMPI(ierr);
648c3ff71bSJunchao Zhang     ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
656a29ce69SStefano Zampini     ierr = MatSetSizes(mpiaij->B,mat->rmap->n,size > 1 ? mat->cmap->N : 0,mat->rmap->n,size > 1 ? mat->cmap->N : 0);CHKERRQ(ierr);
668c3ff71bSJunchao Zhang     ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);CHKERRQ(ierr);
678c3ff71bSJunchao Zhang   }
686a29ce69SStefano Zampini   ierr = MatSetType(mpiaij->A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
696a29ce69SStefano Zampini   ierr = MatSetType(mpiaij->B,MATSEQAIJKOKKOS);CHKERRQ(ierr);
708c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
718c3ff71bSJunchao Zhang   ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
728c3ff71bSJunchao Zhang   mat->preallocated = PETSC_TRUE;
738c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
748c3ff71bSJunchao Zhang }
758c3ff71bSJunchao Zhang 
768c3ff71bSJunchao Zhang PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
778c3ff71bSJunchao Zhang {
788c3ff71bSJunchao Zhang   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
798c3ff71bSJunchao Zhang   PetscErrorCode ierr;
808c3ff71bSJunchao Zhang   PetscInt       nt;
818c3ff71bSJunchao Zhang 
828c3ff71bSJunchao Zhang   PetscFunctionBegin;
838c3ff71bSJunchao Zhang   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
848c3ff71bSJunchao Zhang   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt);
858c3ff71bSJunchao Zhang   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
868c3ff71bSJunchao Zhang   ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
878c3ff71bSJunchao Zhang   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
888c3ff71bSJunchao Zhang   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
898c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
908c3ff71bSJunchao Zhang }
918c3ff71bSJunchao Zhang 
928c3ff71bSJunchao Zhang PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz)
938c3ff71bSJunchao Zhang {
948c3ff71bSJunchao Zhang   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
958c3ff71bSJunchao Zhang   PetscErrorCode ierr;
968c3ff71bSJunchao Zhang   PetscInt       nt;
978c3ff71bSJunchao Zhang 
988c3ff71bSJunchao Zhang   PetscFunctionBegin;
998c3ff71bSJunchao Zhang   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
1008c3ff71bSJunchao Zhang   if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt);
1018c3ff71bSJunchao Zhang   ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1028c3ff71bSJunchao Zhang   ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
1038c3ff71bSJunchao Zhang   ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1048c3ff71bSJunchao Zhang   ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
1058c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1068c3ff71bSJunchao Zhang }
1078c3ff71bSJunchao Zhang 
1088c3ff71bSJunchao Zhang PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy)
1098c3ff71bSJunchao Zhang {
1108c3ff71bSJunchao Zhang   Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)mat->data;
1118c3ff71bSJunchao Zhang   PetscErrorCode ierr;
1128c3ff71bSJunchao Zhang   PetscInt       nt;
1138c3ff71bSJunchao Zhang 
1148c3ff71bSJunchao Zhang   PetscFunctionBegin;
1158c3ff71bSJunchao Zhang   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
1168c3ff71bSJunchao Zhang   if (nt != mat->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->rmap->n,nt);
1178c3ff71bSJunchao Zhang   ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
1188c3ff71bSJunchao Zhang   ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
1198c3ff71bSJunchao Zhang   ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
1208c3ff71bSJunchao Zhang   ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
1218c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1228c3ff71bSJunchao Zhang }
1238c3ff71bSJunchao Zhang 
1248c3ff71bSJunchao Zhang PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
1258c3ff71bSJunchao Zhang {
1268c3ff71bSJunchao Zhang   PetscErrorCode     ierr;
1278c3ff71bSJunchao Zhang   Mat                B;
1288c3ff71bSJunchao Zhang 
1298c3ff71bSJunchao Zhang   PetscFunctionBegin;
1308c3ff71bSJunchao Zhang   if (reuse == MAT_INITIAL_MATRIX) {
1318c3ff71bSJunchao Zhang     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
1328c3ff71bSJunchao Zhang   } else if (reuse == MAT_REUSE_MATRIX) {
1338c3ff71bSJunchao Zhang     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
1348c3ff71bSJunchao Zhang   }
1358c3ff71bSJunchao Zhang   B = *newmat;
1368c3ff71bSJunchao Zhang 
1376f3d89d0SStefano Zampini   B->boundtocpu = PETSC_FALSE;
1388c3ff71bSJunchao Zhang   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
1398c3ff71bSJunchao Zhang   ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr);
1403d0639e7SStefano Zampini   ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);CHKERRQ(ierr);
1418c3ff71bSJunchao Zhang 
1428c3ff71bSJunchao Zhang   B->ops->assemblyend           = MatAssemblyEnd_MPIAIJKokkos;
1438c3ff71bSJunchao Zhang   B->ops->mult                  = MatMult_MPIAIJKokkos;
1448c3ff71bSJunchao Zhang   B->ops->multadd               = MatMultAdd_MPIAIJKokkos;
1458c3ff71bSJunchao Zhang   B->ops->multtranspose         = MatMultTranspose_MPIAIJKokkos;
1464e84afc0SStefano Zampini   // Needs an efficient implementation of the COO preallocation routines
1474e84afc0SStefano Zampini   //B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
1488c3ff71bSJunchao Zhang 
1493d0639e7SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);CHKERRQ(ierr);
1508c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1518c3ff71bSJunchao Zhang }
1528c3ff71bSJunchao Zhang 
1538c3ff71bSJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A)
1548c3ff71bSJunchao Zhang {
1558c3ff71bSJunchao Zhang   PetscErrorCode ierr;
1568c3ff71bSJunchao Zhang 
1578c3ff71bSJunchao Zhang   PetscFunctionBegin;
1588c3ff71bSJunchao Zhang   ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr);
1598c3ff71bSJunchao Zhang   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
1608c3ff71bSJunchao Zhang   ierr = MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
1618c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
1628c3ff71bSJunchao Zhang }
1638c3ff71bSJunchao Zhang 
1648c3ff71bSJunchao Zhang /*@C
1658c3ff71bSJunchao Zhang    MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format
1668c3ff71bSJunchao Zhang    (the default parallel PETSc format).  This matrix will ultimately pushed down
1678c3ff71bSJunchao Zhang    to Kokkos for calculations. For good matrix
1688c3ff71bSJunchao Zhang    assembly performance the user should preallocate the matrix storage by setting
1698c3ff71bSJunchao Zhang    the parameter nz (or the array nnz).  By setting these parameters accurately,
1708c3ff71bSJunchao Zhang    performance during matrix assembly can be increased by more than a factor of 50.
1718c3ff71bSJunchao Zhang 
1728c3ff71bSJunchao Zhang    Collective
1738c3ff71bSJunchao Zhang 
1748c3ff71bSJunchao Zhang    Input Parameters:
1758c3ff71bSJunchao Zhang +  comm - MPI communicator, set to PETSC_COMM_SELF
1768c3ff71bSJunchao Zhang .  m - number of rows
1778c3ff71bSJunchao Zhang .  n - number of columns
1788c3ff71bSJunchao Zhang .  nz - number of nonzeros per row (same for all rows)
1798c3ff71bSJunchao Zhang -  nnz - array containing the number of nonzeros in the various rows
1808c3ff71bSJunchao Zhang          (possibly different for each row) or NULL
1818c3ff71bSJunchao Zhang 
1828c3ff71bSJunchao Zhang    Output Parameter:
1838c3ff71bSJunchao Zhang .  A - the matrix
1848c3ff71bSJunchao Zhang 
1858c3ff71bSJunchao Zhang    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1868c3ff71bSJunchao Zhang    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1878c3ff71bSJunchao Zhang    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1888c3ff71bSJunchao Zhang 
1898c3ff71bSJunchao Zhang    Notes:
1908c3ff71bSJunchao Zhang    If nnz is given then nz is ignored
1918c3ff71bSJunchao Zhang 
1928c3ff71bSJunchao Zhang    The AIJ format (also called the Yale sparse matrix format or
1938c3ff71bSJunchao Zhang    compressed row storage), is fully compatible with standard Fortran 77
1948c3ff71bSJunchao Zhang    storage.  That is, the stored row and column indices can begin at
1958c3ff71bSJunchao Zhang    either one (as in Fortran) or zero.  See the users' manual for details.
1968c3ff71bSJunchao Zhang 
1978c3ff71bSJunchao Zhang    Specify the preallocated storage with either nz or nnz (not both).
1988c3ff71bSJunchao Zhang    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1998c3ff71bSJunchao Zhang    allocation.  For large problems you MUST preallocate memory or you
2008c3ff71bSJunchao Zhang    will get TERRIBLE performance, see the users' manual chapter on matrices.
2018c3ff71bSJunchao Zhang 
2028c3ff71bSJunchao Zhang    By default, this format uses inodes (identical nodes) when possible, to
2038c3ff71bSJunchao Zhang    improve numerical efficiency of matrix-vector products and solves. We
2048c3ff71bSJunchao Zhang    search for consecutive rows with the same nonzero structure, thereby
2058c3ff71bSJunchao Zhang    reusing matrix information to achieve increased efficiency.
2068c3ff71bSJunchao Zhang 
2078c3ff71bSJunchao Zhang    Level: intermediate
2088c3ff71bSJunchao Zhang 
2098c3ff71bSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos
2108c3ff71bSJunchao Zhang @*/
2118c3ff71bSJunchao Zhang PetscErrorCode  MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
2128c3ff71bSJunchao Zhang {
2138c3ff71bSJunchao Zhang   PetscErrorCode ierr;
2148c3ff71bSJunchao Zhang   PetscMPIInt    size;
2158c3ff71bSJunchao Zhang 
2168c3ff71bSJunchao Zhang   PetscFunctionBegin;
2178c3ff71bSJunchao Zhang   ierr = MatCreate(comm,A);CHKERRQ(ierr);
2188c3ff71bSJunchao Zhang   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
219ffc4695bSBarry Smith   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
2208c3ff71bSJunchao Zhang   if (size > 1) {
2218c3ff71bSJunchao Zhang     ierr = MatSetType(*A,MATMPIAIJKOKKOS);CHKERRQ(ierr);
2228c3ff71bSJunchao Zhang     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
2238c3ff71bSJunchao Zhang   } else {
2248c3ff71bSJunchao Zhang     ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr);
2258c3ff71bSJunchao Zhang     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
2268c3ff71bSJunchao Zhang   }
2278c3ff71bSJunchao Zhang   PetscFunctionReturn(0);
2288c3ff71bSJunchao Zhang }
2298c3ff71bSJunchao Zhang 
230a587d139SMark // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
231042217e8SBarry Smith PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
232a587d139SMark {
233a587d139SMark   PetscMPIInt                size,rank;
234a587d139SMark   MPI_Comm                   comm;
235a587d139SMark   PetscErrorCode             ierr;
236042217e8SBarry Smith   PetscSplitCSRDataStructure d_mat=NULL;
237a587d139SMark 
238a587d139SMark   PetscFunctionBegin;
239a587d139SMark   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
24055b25c41SPierre Jolivet   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
24155b25c41SPierre Jolivet   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
242a587d139SMark   if (size == 1) {
243a587d139SMark     ierr   = MatSeqAIJKokkosGetDeviceMat(A,&d_mat);CHKERRQ(ierr);
244a587d139SMark   } else {
245a587d139SMark     Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
246a587d139SMark     ierr   = MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);CHKERRQ(ierr);
247a587d139SMark   }
248a587d139SMark   // act like MatSetValues because not called on host
249a587d139SMark   if (A->assembled) {
250a587d139SMark     if (A->was_assembled) {
251a587d139SMark       ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
252a587d139SMark     }
253a587d139SMark     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
254a587d139SMark   } else {
255a587d139SMark     ierr = PetscInfo1(A,"Warning !assemble ??? assembled=%D\n",A->assembled);CHKERRQ(ierr);
256a587d139SMark     // SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
257a587d139SMark   }
258a587d139SMark   if (!d_mat) {
259042217e8SBarry Smith     struct _n_SplitCSRMat h_mat; /* host container */
260a587d139SMark     Mat_SeqAIJKokkos      *aijkokA;
261a587d139SMark     Mat_SeqAIJ            *jaca;
262a587d139SMark     PetscInt              n = A->rmap->n, nnz;
263a587d139SMark     Mat                   Amat;
264042217e8SBarry Smith     PetscInt              *colmap;
265042217e8SBarry Smith 
266042217e8SBarry Smith     /* create and copy h_mat */
267a587d139SMark     ierr = PetscInfo(A,"Create device matrix in Kokkos\n");CHKERRQ(ierr);
268a587d139SMark     if (size == 1) {
269a587d139SMark       Amat = A;
270a587d139SMark       jaca = (Mat_SeqAIJ*)A->data;
271a587d139SMark       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
272a587d139SMark       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
273a587d139SMark       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
274a587d139SMark       h_mat.offdiag.a = NULL;
275a587d139SMark       aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
276a587d139SMark       aijkokA->i_uncompressed_d = NULL;
277a587d139SMark       aijkokA->colmap_d = NULL;
278a587d139SMark     } else {
279a587d139SMark       Mat_MPIAIJ       *aij = (Mat_MPIAIJ*)A->data;
280a587d139SMark       Mat_SeqAIJ       *jacb = (Mat_SeqAIJ*)aij->B->data;
281a587d139SMark       PetscInt         ii;
282a587d139SMark       Mat_SeqAIJKokkos *aijkokB;
283042217e8SBarry Smith 
284a587d139SMark       Amat = aij->A;
285a587d139SMark       aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr);
286a587d139SMark       aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr);
287a587d139SMark       aijkokA->i_uncompressed_d = NULL;
288a587d139SMark       aijkokA->colmap_d = NULL;
289a587d139SMark       jaca = (Mat_SeqAIJ*)aij->A->data;
290b3c64f9dSJunchao Zhang       if (aij->B->cmap->n && !aij->garray) SETERRQ(comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
291a587d139SMark       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
292a587d139SMark       aij->donotstash = PETSC_TRUE;
293a587d139SMark       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
294*a5b23f4aSJose E. Roman       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
295042217e8SBarry Smith       ierr = PetscCalloc1(A->cmap->N,&colmap);CHKERRQ(ierr);
296042217e8SBarry Smith       ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N)*sizeof(PetscInt));CHKERRQ(ierr);
297042217e8SBarry Smith       for (ii=0; ii<aij->B->cmap->n; ii++) colmap[aij->garray[ii]] = ii+1;
298a587d139SMark       // allocate B copy data
299a587d139SMark       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
300a587d139SMark       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
301a587d139SMark       nnz = jacb->i[n];
302a587d139SMark       if (jacb->compressedrow.use) {
303a587d139SMark         const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1);
3049d676ae4SMark Adams         aijkokB->i_uncompressed_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_i_k));
305a587d139SMark         Kokkos::deep_copy (*aijkokB->i_uncompressed_d, h_i_k);
306a587d139SMark         h_mat.offdiag.i = aijkokB->i_uncompressed_d->data();
307a587d139SMark       } else {
308a587d139SMark          h_mat.offdiag.i = (PetscInt*)aijkokB->i_d.data();
309a587d139SMark       }
310a587d139SMark       h_mat.offdiag.j = (PetscInt*)aijkokB->j_d.data();
311a587d139SMark       h_mat.offdiag.a = aijkokB->a_d.data();
312a587d139SMark       {
313042217e8SBarry Smith         Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (colmap,A->cmap->N);
3149d676ae4SMark Adams         aijkokB->colmap_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_colmap_k));
315a587d139SMark         Kokkos::deep_copy (*aijkokB->colmap_d, h_colmap_k);
316a587d139SMark         h_mat.colmap = aijkokB->colmap_d->data();
317042217e8SBarry Smith         ierr = PetscFree(colmap);CHKERRQ(ierr);
318a587d139SMark       }
319a587d139SMark       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
320a587d139SMark       h_mat.offdiag.n = n;
321a587d139SMark     }
322a587d139SMark     // allocate A copy data
323a587d139SMark     nnz = jaca->i[n];
324a587d139SMark     h_mat.diag.n = n;
325a587d139SMark     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
32655b25c41SPierre Jolivet     ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
327042217e8SBarry Smith     if (jaca->compressedrow.use) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)");
328042217e8SBarry Smith     else {
329a587d139SMark       h_mat.diag.i = (PetscInt*)aijkokA->i_d.data();
330a587d139SMark     }
331a587d139SMark     h_mat.diag.j = (PetscInt*)aijkokA->j_d.data();
332a587d139SMark     h_mat.diag.a = aijkokA->a_d.data();
333a587d139SMark     // copy pointers and metdata to device
334a587d139SMark     ierr = MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);CHKERRQ(ierr);
335a587d139SMark     ierr = MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);CHKERRQ(ierr);
336a587d139SMark     ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
337a587d139SMark   }
338a587d139SMark   *B = d_mat; // return it, set it in Mat, and set it up
339a587d139SMark   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
340a587d139SMark   PetscFunctionReturn(0);
341a587d139SMark }
342