18c3ff71bSJunchao Zhang #include <petscconf.h> 28c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/mpi/mpiaij.h> 3a587d139SMark #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 4a587d139SMark #include <petscveckokkos.hpp> 58c3ff71bSJunchao Zhang 68c3ff71bSJunchao Zhang PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode) 78c3ff71bSJunchao Zhang { 88c3ff71bSJunchao Zhang PetscErrorCode ierr; 98c3ff71bSJunchao Zhang Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 10a587d139SMark Mat_SeqAIJKokkos *aijkok = mpiaij->A->spptr ? static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr) : NULL; 118c3ff71bSJunchao Zhang 128c3ff71bSJunchao Zhang PetscFunctionBegin; 138c3ff71bSJunchao Zhang ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 148c3ff71bSJunchao Zhang if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 158c3ff71bSJunchao Zhang ierr = VecSetType(mpiaij->lvec,VECSEQKOKKOS);CHKERRQ(ierr); 168c3ff71bSJunchao Zhang } 17a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 18a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 19a587d139SMark } 20a587d139SMark 218c3ff71bSJunchao Zhang PetscFunctionReturn(0); 228c3ff71bSJunchao Zhang } 238c3ff71bSJunchao Zhang 248c3ff71bSJunchao Zhang PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 258c3ff71bSJunchao Zhang { 268c3ff71bSJunchao Zhang PetscErrorCode ierr; 278c3ff71bSJunchao Zhang PetscInt i; 288c3ff71bSJunchao Zhang Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 298c3ff71bSJunchao Zhang 308c3ff71bSJunchao Zhang PetscFunctionBegin; 318c3ff71bSJunchao Zhang ierr = PetscLayoutSetUp(mat->rmap);CHKERRQ(ierr); 328c3ff71bSJunchao Zhang ierr = PetscLayoutSetUp(mat->cmap);CHKERRQ(ierr); 338c3ff71bSJunchao Zhang if (d_nnz) { 348c3ff71bSJunchao Zhang for (i=0; i<mat->rmap->n; i++) { 358c3ff71bSJunchao Zhang if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 368c3ff71bSJunchao Zhang } 378c3ff71bSJunchao Zhang } 388c3ff71bSJunchao Zhang if (o_nnz) { 398c3ff71bSJunchao Zhang for (i=0; i<mat->rmap->n; i++) { 408c3ff71bSJunchao Zhang if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 418c3ff71bSJunchao Zhang } 428c3ff71bSJunchao Zhang } 438c3ff71bSJunchao Zhang if (!mat->preallocated) { 448c3ff71bSJunchao Zhang /* Explicitly create 2 MATSEQAIJKOKKOS matrices. */ 458c3ff71bSJunchao Zhang ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr); 468c3ff71bSJunchao Zhang ierr = MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n);CHKERRQ(ierr); 478c3ff71bSJunchao Zhang ierr = MatSetType(mpiaij->A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 488c3ff71bSJunchao Zhang ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A);CHKERRQ(ierr); 498c3ff71bSJunchao Zhang ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr); 508c3ff71bSJunchao Zhang ierr = MatSetSizes(mpiaij->B,mat->rmap->n,mat->cmap->N,mat->rmap->n,mat->cmap->N);CHKERRQ(ierr); 518c3ff71bSJunchao Zhang ierr = MatSetType(mpiaij->B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 528c3ff71bSJunchao Zhang ierr = PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B);CHKERRQ(ierr); 538c3ff71bSJunchao Zhang } 548c3ff71bSJunchao Zhang ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr); 558c3ff71bSJunchao Zhang ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr); 568c3ff71bSJunchao Zhang 578c3ff71bSJunchao Zhang mat->preallocated = PETSC_TRUE; 588c3ff71bSJunchao Zhang PetscFunctionReturn(0); 598c3ff71bSJunchao Zhang } 608c3ff71bSJunchao Zhang 618c3ff71bSJunchao Zhang PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy) 628c3ff71bSJunchao Zhang { 638c3ff71bSJunchao Zhang Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 648c3ff71bSJunchao Zhang PetscErrorCode ierr; 658c3ff71bSJunchao Zhang PetscInt nt; 668c3ff71bSJunchao Zhang 678c3ff71bSJunchao Zhang PetscFunctionBegin; 688c3ff71bSJunchao Zhang ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 698c3ff71bSJunchao Zhang if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt); 708c3ff71bSJunchao Zhang ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 718c3ff71bSJunchao Zhang ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr); 728c3ff71bSJunchao Zhang ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 738c3ff71bSJunchao Zhang ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr); 748c3ff71bSJunchao Zhang PetscFunctionReturn(0); 758c3ff71bSJunchao Zhang } 768c3ff71bSJunchao Zhang 778c3ff71bSJunchao Zhang PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz) 788c3ff71bSJunchao Zhang { 798c3ff71bSJunchao Zhang Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 808c3ff71bSJunchao Zhang PetscErrorCode ierr; 818c3ff71bSJunchao Zhang PetscInt nt; 828c3ff71bSJunchao Zhang 838c3ff71bSJunchao Zhang PetscFunctionBegin; 848c3ff71bSJunchao Zhang ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 858c3ff71bSJunchao Zhang if (nt != mat->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->cmap->n,nt); 868c3ff71bSJunchao Zhang ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 878c3ff71bSJunchao Zhang ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr); 888c3ff71bSJunchao Zhang ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 898c3ff71bSJunchao Zhang ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr); 908c3ff71bSJunchao Zhang PetscFunctionReturn(0); 918c3ff71bSJunchao Zhang } 928c3ff71bSJunchao Zhang 938c3ff71bSJunchao Zhang PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy) 948c3ff71bSJunchao Zhang { 958c3ff71bSJunchao Zhang Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 968c3ff71bSJunchao Zhang PetscErrorCode ierr; 978c3ff71bSJunchao Zhang PetscInt nt; 988c3ff71bSJunchao Zhang 998c3ff71bSJunchao Zhang PetscFunctionBegin; 1008c3ff71bSJunchao Zhang ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 1018c3ff71bSJunchao Zhang if (nt != mat->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%D) and xx (%D)",mat->rmap->n,nt); 1028c3ff71bSJunchao Zhang ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr); 1038c3ff71bSJunchao Zhang ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr); 1048c3ff71bSJunchao Zhang ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 1058c3ff71bSJunchao Zhang ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 1068c3ff71bSJunchao Zhang PetscFunctionReturn(0); 1078c3ff71bSJunchao Zhang } 1088c3ff71bSJunchao Zhang 1098c3ff71bSJunchao Zhang PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 1108c3ff71bSJunchao Zhang { 1118c3ff71bSJunchao Zhang PetscErrorCode ierr; 1128c3ff71bSJunchao Zhang Mat B; 1138c3ff71bSJunchao Zhang 1148c3ff71bSJunchao Zhang PetscFunctionBegin; 1158c3ff71bSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 1168c3ff71bSJunchao Zhang ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 1178c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 1188c3ff71bSJunchao Zhang ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 1198c3ff71bSJunchao Zhang } 1208c3ff71bSJunchao Zhang B = *newmat; 1218c3ff71bSJunchao Zhang 1228c3ff71bSJunchao Zhang ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 1238c3ff71bSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&B->defaultvectype);CHKERRQ(ierr); 1243d0639e7SStefano Zampini ierr = PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS);CHKERRQ(ierr); 1258c3ff71bSJunchao Zhang 1268c3ff71bSJunchao Zhang B->ops->assemblyend = MatAssemblyEnd_MPIAIJKokkos; 1278c3ff71bSJunchao Zhang B->ops->mult = MatMult_MPIAIJKokkos; 1288c3ff71bSJunchao Zhang B->ops->multadd = MatMultAdd_MPIAIJKokkos; 1298c3ff71bSJunchao Zhang B->ops->multtranspose = MatMultTranspose_MPIAIJKokkos; 1308c3ff71bSJunchao Zhang 1313d0639e7SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos);CHKERRQ(ierr); 1328c3ff71bSJunchao Zhang PetscFunctionReturn(0); 1338c3ff71bSJunchao Zhang } 1348c3ff71bSJunchao Zhang 1358c3ff71bSJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A) 1368c3ff71bSJunchao Zhang { 1378c3ff71bSJunchao Zhang PetscErrorCode ierr; 1388c3ff71bSJunchao Zhang 1398c3ff71bSJunchao Zhang PetscFunctionBegin; 1408c3ff71bSJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 1418c3ff71bSJunchao Zhang ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 1428c3ff71bSJunchao Zhang ierr = MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 1438c3ff71bSJunchao Zhang PetscFunctionReturn(0); 1448c3ff71bSJunchao Zhang } 1458c3ff71bSJunchao Zhang 1468c3ff71bSJunchao Zhang /*@C 1478c3ff71bSJunchao Zhang MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 1488c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately pushed down 1498c3ff71bSJunchao Zhang to Kokkos for calculations. For good matrix 1508c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 1518c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 1528c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 1538c3ff71bSJunchao Zhang 1548c3ff71bSJunchao Zhang Collective 1558c3ff71bSJunchao Zhang 1568c3ff71bSJunchao Zhang Input Parameters: 1578c3ff71bSJunchao Zhang + comm - MPI communicator, set to PETSC_COMM_SELF 1588c3ff71bSJunchao Zhang . m - number of rows 1598c3ff71bSJunchao Zhang . n - number of columns 1608c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 1618c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 1628c3ff71bSJunchao Zhang (possibly different for each row) or NULL 1638c3ff71bSJunchao Zhang 1648c3ff71bSJunchao Zhang Output Parameter: 1658c3ff71bSJunchao Zhang . A - the matrix 1668c3ff71bSJunchao Zhang 1678c3ff71bSJunchao Zhang It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1688c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradigm instead of this routine directly. 1698c3ff71bSJunchao Zhang [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1708c3ff71bSJunchao Zhang 1718c3ff71bSJunchao Zhang Notes: 1728c3ff71bSJunchao Zhang If nnz is given then nz is ignored 1738c3ff71bSJunchao Zhang 1748c3ff71bSJunchao Zhang The AIJ format (also called the Yale sparse matrix format or 1758c3ff71bSJunchao Zhang compressed row storage), is fully compatible with standard Fortran 77 1768c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 1778c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 1788c3ff71bSJunchao Zhang 1798c3ff71bSJunchao Zhang Specify the preallocated storage with either nz or nnz (not both). 1808c3ff71bSJunchao Zhang Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1818c3ff71bSJunchao Zhang allocation. For large problems you MUST preallocate memory or you 1828c3ff71bSJunchao Zhang will get TERRIBLE performance, see the users' manual chapter on matrices. 1838c3ff71bSJunchao Zhang 1848c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 1858c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 1868c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 1878c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 1888c3ff71bSJunchao Zhang 1898c3ff71bSJunchao Zhang Level: intermediate 1908c3ff71bSJunchao Zhang 1918c3ff71bSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJKOKKOS, MATAIJKokkos 1928c3ff71bSJunchao Zhang @*/ 1938c3ff71bSJunchao Zhang PetscErrorCode MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 1948c3ff71bSJunchao Zhang { 1958c3ff71bSJunchao Zhang PetscErrorCode ierr; 1968c3ff71bSJunchao Zhang PetscMPIInt size; 1978c3ff71bSJunchao Zhang 1988c3ff71bSJunchao Zhang PetscFunctionBegin; 1998c3ff71bSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 2008c3ff71bSJunchao Zhang ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 201*ffc4695bSBarry Smith ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 2028c3ff71bSJunchao Zhang if (size > 1) { 2038c3ff71bSJunchao Zhang ierr = MatSetType(*A,MATMPIAIJKOKKOS);CHKERRQ(ierr); 2048c3ff71bSJunchao Zhang ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 2058c3ff71bSJunchao Zhang } else { 2068c3ff71bSJunchao Zhang ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 2078c3ff71bSJunchao Zhang ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 2088c3ff71bSJunchao Zhang } 2098c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2108c3ff71bSJunchao Zhang } 2118c3ff71bSJunchao Zhang 212a587d139SMark // get GPU pointer to stripped down Mat. For both Seq and MPI Mat. 213a587d139SMark PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B) 214a587d139SMark { 215a587d139SMark #if defined(PETSC_USE_CTABLE) 216a587d139SMark SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)"); 217a587d139SMark #else 218a587d139SMark PetscMPIInt size,rank; 219a587d139SMark MPI_Comm comm; 220a587d139SMark PetscErrorCode ierr; 221a587d139SMark PetscSplitCSRDataStructure *d_mat=NULL; 222a587d139SMark 223a587d139SMark PetscFunctionBegin; 224a587d139SMark ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 225a587d139SMark ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 226a587d139SMark ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 227a587d139SMark if (size == 1) { 228a587d139SMark ierr = MatSeqAIJKokkosGetDeviceMat(A,&d_mat);CHKERRQ(ierr); 229a587d139SMark } else { 230a587d139SMark Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 231a587d139SMark ierr = MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat);CHKERRQ(ierr); 232a587d139SMark } 233a587d139SMark // act like MatSetValues because not called on host 234a587d139SMark if (A->assembled) { 235a587d139SMark if (A->was_assembled) { 236a587d139SMark ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr); 237a587d139SMark } 238a587d139SMark A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here? 239a587d139SMark } else { 240a587d139SMark ierr = PetscInfo1(A,"Warning !assemble ??? assembled=%D\n",A->assembled);CHKERRQ(ierr); 241a587d139SMark // SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix"); 242a587d139SMark } 243a587d139SMark if (!d_mat) { 244a587d139SMark PetscSplitCSRDataStructure h_mat; /* host container */ 245a587d139SMark Mat_SeqAIJKokkos *aijkokA; 246a587d139SMark Mat_SeqAIJ *jaca; 247a587d139SMark PetscInt n = A->rmap->n, nnz; 248a587d139SMark Mat Amat; 249a587d139SMark // create and copy 250a587d139SMark ierr = PetscInfo(A,"Create device matrix in Kokkos\n");CHKERRQ(ierr); 251a587d139SMark if (size == 1) { 252a587d139SMark Amat = A; 253a587d139SMark jaca = (Mat_SeqAIJ*)A->data; 254a587d139SMark h_mat.rstart = 0; h_mat.rend = A->rmap->n; 255a587d139SMark h_mat.cstart = 0; h_mat.cend = A->cmap->n; 256a587d139SMark h_mat.offdiag.i = h_mat.offdiag.j = NULL; 257a587d139SMark h_mat.offdiag.a = NULL; 258a587d139SMark aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 259a587d139SMark aijkokA->i_uncompressed_d = NULL; 260a587d139SMark aijkokA->colmap_d = NULL; 261a587d139SMark } else { 262a587d139SMark Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 263a587d139SMark Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data; 264a587d139SMark PetscInt ii; 265a587d139SMark Mat_SeqAIJKokkos *aijkokB; 266a587d139SMark Amat = aij->A; 267a587d139SMark aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr); 268a587d139SMark aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr); 269a587d139SMark aijkokA->i_uncompressed_d = NULL; 270a587d139SMark aijkokA->colmap_d = NULL; 271a587d139SMark jaca = (Mat_SeqAIJ*)aij->A->data; 272a587d139SMark if (!aij->garray) SETERRQ(comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 273a587d139SMark if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n"); 274a587d139SMark // create colmap - this is ussually done (lazy) in MatSetValues 275a587d139SMark aij->donotstash = PETSC_TRUE; 276a587d139SMark aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE; 277a587d139SMark jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly 278a587d139SMark ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr); 279a587d139SMark aij->colmap[A->cmap->N] = -9; 280a587d139SMark ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr); 281a587d139SMark for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1; 282a587d139SMark if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9"); 283a587d139SMark // allocate B copy data 284a587d139SMark h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend; 285a587d139SMark h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend; 286a587d139SMark nnz = jacb->i[n]; 287a587d139SMark if (jacb->compressedrow.use) { 288a587d139SMark const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1); 289a587d139SMark aijkokB->i_uncompressed_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_i_k)); 290a587d139SMark Kokkos::deep_copy (*aijkokB->i_uncompressed_d, h_i_k); 291a587d139SMark h_mat.offdiag.i = aijkokB->i_uncompressed_d->data(); 292a587d139SMark //err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 293a587d139SMark //err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 294a587d139SMark } else { 295a587d139SMark h_mat.offdiag.i = (PetscInt*)aijkokB->i_d.data(); 296a587d139SMark } 297a587d139SMark h_mat.offdiag.j = (PetscInt*)aijkokB->j_d.data(); 298a587d139SMark h_mat.offdiag.a = aijkokB->a_d.data(); 299a587d139SMark { 300a587d139SMark Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (aij->colmap,A->cmap->N); 301a587d139SMark aijkokB->colmap_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DeviceMemorySpace(),h_colmap_k)); 302a587d139SMark Kokkos::deep_copy (*aijkokB->colmap_d, h_colmap_k); 303a587d139SMark h_mat.colmap = aijkokB->colmap_d->data(); 304a587d139SMark } 305a587d139SMark //err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output 306a587d139SMark //err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err); 307a587d139SMark h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries; 308a587d139SMark h_mat.offdiag.n = n; 309a587d139SMark } 310a587d139SMark // allocate A copy data 311a587d139SMark nnz = jaca->i[n]; 312a587d139SMark h_mat.diag.n = n; 313a587d139SMark h_mat.diag.ignorezeroentries = jaca->ignorezeroentries; 314a587d139SMark ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr); 315a587d139SMark if (jaca->compressedrow.use) { 316a587d139SMark SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)"); 317a587d139SMark } else { 318a587d139SMark h_mat.diag.i = (PetscInt*)aijkokA->i_d.data(); 319a587d139SMark } 320a587d139SMark //h_mat.diag.j = aj; 321a587d139SMark //h_mat.diag.a = aa; 322a587d139SMark h_mat.diag.j = (PetscInt*)aijkokA->j_d.data(); 323a587d139SMark h_mat.diag.a = aijkokA->a_d.data(); 324a587d139SMark // copy pointers and metdata to device 325a587d139SMark ierr = MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat);CHKERRQ(ierr); 326a587d139SMark ierr = MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat);CHKERRQ(ierr); 327a587d139SMark ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr); 328a587d139SMark } 329a587d139SMark *B = d_mat; // return it, set it in Mat, and set it up 330a587d139SMark A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues 331a587d139SMark PetscFunctionReturn(0); 332a587d139SMark #endif 333a587d139SMark } 334