12327d61dSSatish Balay #include "petscconf.h" 22327d61dSSatish Balay PETSC_CUDA_EXTERN_C_BEGIN 39ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 42327d61dSSatish Balay PETSC_CUDA_EXTERN_C_END 5bbf3fe20SPaul Mullowney #include "mpicusparsematimpl.h" 69ae82921SPaul Mullowney 79ae82921SPaul Mullowney #undef __FUNCT__ 89ae82921SPaul Mullowney #define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE" 99ae82921SPaul Mullowney PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 109ae82921SPaul Mullowney { 11bbf3fe20SPaul Mullowney Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 12bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 139ae82921SPaul Mullowney PetscErrorCode ierr; 149ae82921SPaul Mullowney PetscInt i; 159ae82921SPaul Mullowney 169ae82921SPaul Mullowney PetscFunctionBegin; 179ae82921SPaul Mullowney if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5; 189ae82921SPaul Mullowney if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2; 199ae82921SPaul Mullowney if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz); 209ae82921SPaul Mullowney if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz); 219ae82921SPaul Mullowney 229ae82921SPaul Mullowney ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr); 239ae82921SPaul Mullowney ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr); 249ae82921SPaul Mullowney ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 259ae82921SPaul Mullowney ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 269ae82921SPaul Mullowney if (d_nnz) { 279ae82921SPaul Mullowney for (i=0; i<B->rmap->n; i++) { 289ae82921SPaul Mullowney if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 
299ae82921SPaul Mullowney } 309ae82921SPaul Mullowney } 319ae82921SPaul Mullowney if (o_nnz) { 329ae82921SPaul Mullowney for (i=0; i<B->rmap->n; i++) { 339ae82921SPaul Mullowney if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 349ae82921SPaul Mullowney } 359ae82921SPaul Mullowney } 369ae82921SPaul Mullowney if (!B->preallocated) { 37bbf3fe20SPaul Mullowney /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 389ae82921SPaul Mullowney ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 399ae82921SPaul Mullowney ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 409ae82921SPaul Mullowney ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 419ae82921SPaul Mullowney ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr); 429ae82921SPaul Mullowney ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 439ae82921SPaul Mullowney ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 449ae82921SPaul Mullowney ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 459ae82921SPaul Mullowney ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr); 469ae82921SPaul Mullowney } 479ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 489ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 49e057df02SPaul Mullowney ierr=MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 50e057df02SPaul Mullowney ierr=MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 519ae82921SPaul Mullowney B->preallocated = PETSC_TRUE; 529ae82921SPaul Mullowney PetscFunctionReturn(0); 539ae82921SPaul Mullowney } 54e057df02SPaul Mullowney 559ae82921SPaul Mullowney #undef __FUNCT__ 569ae82921SPaul Mullowney #define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE" 579ae82921SPaul Mullowney PetscErrorCode 
MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left) 589ae82921SPaul Mullowney { 599ae82921SPaul Mullowney PetscErrorCode ierr; 609ae82921SPaul Mullowney 619ae82921SPaul Mullowney PetscFunctionBegin; 629ae82921SPaul Mullowney if (right) { 639ae82921SPaul Mullowney ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr); 649ae82921SPaul Mullowney ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 659ae82921SPaul Mullowney ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 669ae82921SPaul Mullowney ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr); 679ae82921SPaul Mullowney ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 689ae82921SPaul Mullowney } 699ae82921SPaul Mullowney if (left) { 709ae82921SPaul Mullowney ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr); 719ae82921SPaul Mullowney ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 729ae82921SPaul Mullowney ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 739ae82921SPaul Mullowney ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr); 749ae82921SPaul Mullowney ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 759ae82921SPaul Mullowney } 769ae82921SPaul Mullowney PetscFunctionReturn(0); 779ae82921SPaul Mullowney } 789ae82921SPaul Mullowney 799ae82921SPaul Mullowney 809ae82921SPaul Mullowney #undef __FUNCT__ 819ae82921SPaul Mullowney #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 829ae82921SPaul Mullowney PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 839ae82921SPaul Mullowney { 84e057df02SPaul Mullowney /* This multiplication sequence is different sequence 85e057df02SPaul Mullowney than the CPU version. In particular, the diagonal block 86e057df02SPaul Mullowney multiplication kernel is launched in one stream. 
Then, 87e057df02SPaul Mullowney in a separate stream, the data transfers from DeviceToHost 88e057df02SPaul Mullowney (with MPI messaging in between), then HostToDevice are 89e057df02SPaul Mullowney launched. Once the data transfer stream is synchronized, 90e057df02SPaul Mullowney to ensure messaging is complete, the MatMultAdd kernel 91e057df02SPaul Mullowney is launched in the original (MatMult) stream to protect 92e057df02SPaul Mullowney against race conditions. 93e057df02SPaul Mullowney 94e057df02SPaul Mullowney This sequence should only be called for GPU computation. */ 959ae82921SPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 969ae82921SPaul Mullowney PetscErrorCode ierr; 979ae82921SPaul Mullowney PetscInt nt; 989ae82921SPaul Mullowney 999ae82921SPaul Mullowney PetscFunctionBegin; 1009ae82921SPaul Mullowney ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 1019ae82921SPaul Mullowney if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 1029ae82921SPaul Mullowney ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 1039ae82921SPaul Mullowney ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 1049ae82921SPaul Mullowney ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 1059ae82921SPaul Mullowney ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 1069ae82921SPaul Mullowney ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 1079ae82921SPaul Mullowney ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 1089ae82921SPaul Mullowney PetscFunctionReturn(0); 1099ae82921SPaul Mullowney } 110ca45077fSPaul Mullowney 111ca45077fSPaul Mullowney #undef __FUNCT__ 112ca45077fSPaul Mullowney #define __FUNCT__ "MatMult_MPIAIJCUSPARSE" 113ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 114ca45077fSPaul Mullowney { 115e057df02SPaul Mullowney /* 
This multiplication sequence is different sequence 116e057df02SPaul Mullowney than the CPU version. In particular, the diagonal block 117e057df02SPaul Mullowney multiplication kernel is launched in one stream. Then, 118e057df02SPaul Mullowney in a separate stream, the data transfers from DeviceToHost 119e057df02SPaul Mullowney (with MPI messaging in between), then HostToDevice are 120e057df02SPaul Mullowney launched. Once the data transfer stream is synchronized, 121e057df02SPaul Mullowney to ensure messaging is complete, the MatMultAdd kernel 122e057df02SPaul Mullowney is launched in the original (MatMult) stream to protect 123e057df02SPaul Mullowney against race conditions. 124e057df02SPaul Mullowney 125e057df02SPaul Mullowney This sequence should only be called for GPU computation. */ 126ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 127ca45077fSPaul Mullowney PetscErrorCode ierr; 128ca45077fSPaul Mullowney PetscInt nt; 129ca45077fSPaul Mullowney 130ca45077fSPaul Mullowney PetscFunctionBegin; 131ca45077fSPaul Mullowney ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 132ca45077fSPaul Mullowney if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 133ca45077fSPaul Mullowney ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr); 134ca45077fSPaul Mullowney ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 135ca45077fSPaul Mullowney ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 136ca45077fSPaul Mullowney ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 137ca45077fSPaul Mullowney ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 138ca45077fSPaul Mullowney ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 139ca45077fSPaul Mullowney PetscFunctionReturn(0); 140ca45077fSPaul Mullowney } 1419ae82921SPaul Mullowney 142e057df02SPaul Mullowney 
/*PetscErrorCode MatSetValuesBatch_MPIAIJCUSPARSE(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats); */ 143ca45077fSPaul Mullowney 144ca45077fSPaul Mullowney EXTERN_C_BEGIN 145ca45077fSPaul Mullowney #undef __FUNCT__ 146e057df02SPaul Mullowney #define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE" 147e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 148ca45077fSPaul Mullowney { 149ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 150bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 151e057df02SPaul Mullowney 152ca45077fSPaul Mullowney PetscFunctionBegin; 153e057df02SPaul Mullowney switch (op) { 154e057df02SPaul Mullowney case MAT_CUSPARSE_MULT_DIAG: 155e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 156045c96e1SPaul Mullowney break; 157e057df02SPaul Mullowney case MAT_CUSPARSE_MULT_OFFDIAG: 158e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 159045c96e1SPaul Mullowney break; 160e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 161e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 162e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 163045c96e1SPaul Mullowney break; 164e057df02SPaul Mullowney default: 165e057df02SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. 
Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 166045c96e1SPaul Mullowney } 167ca45077fSPaul Mullowney PetscFunctionReturn(0); 168ca45077fSPaul Mullowney } 169ca45077fSPaul Mullowney EXTERN_C_END 170ca45077fSPaul Mullowney 171e057df02SPaul Mullowney 172e057df02SPaul Mullowney #undef __FUNCT__ 173e057df02SPaul Mullowney #define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE" 174e057df02SPaul Mullowney PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A) 175e057df02SPaul Mullowney { 176e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 177e057df02SPaul Mullowney PetscErrorCode ierr; 178e057df02SPaul Mullowney PetscBool flg; 179e057df02SPaul Mullowney PetscFunctionBegin; 180e057df02SPaul Mullowney ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr); 181e057df02SPaul Mullowney ierr = PetscObjectOptionsBegin((PetscObject)A); 182e057df02SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 183e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 184*7083f85cSSatish Balay "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 185e057df02SPaul Mullowney if (flg) { 186e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 187e057df02SPaul Mullowney } 188e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 189*7083f85cSSatish Balay "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 190e057df02SPaul Mullowney if (flg) { 191e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 192e057df02SPaul Mullowney } 193e057df02SPaul Mullowney 
ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 194*7083f85cSSatish Balay "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 195e057df02SPaul Mullowney if (flg) { 196e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 197e057df02SPaul Mullowney } 198e057df02SPaul Mullowney } 199e057df02SPaul Mullowney ierr = PetscOptionsEnd();CHKERRQ(ierr); 200e057df02SPaul Mullowney PetscFunctionReturn(0); 201e057df02SPaul Mullowney } 202e057df02SPaul Mullowney 203bbf3fe20SPaul Mullowney #undef __FUNCT__ 204bbf3fe20SPaul Mullowney #define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE" 205bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 206bbf3fe20SPaul Mullowney { 207bbf3fe20SPaul Mullowney PetscErrorCode ierr; 208bbf3fe20SPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 209bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 210bbf3fe20SPaul Mullowney 211bbf3fe20SPaul Mullowney PetscFunctionBegin; 212bbf3fe20SPaul Mullowney try { 213bbf3fe20SPaul Mullowney delete cusparseStruct; 214bbf3fe20SPaul Mullowney } catch(char* ex) { 215bbf3fe20SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 216bbf3fe20SPaul Mullowney } 217bbf3fe20SPaul Mullowney cusparseStruct = 0; 218bbf3fe20SPaul Mullowney ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 219bbf3fe20SPaul Mullowney PetscFunctionReturn(0); 220bbf3fe20SPaul Mullowney } 221ca45077fSPaul Mullowney 2229ae82921SPaul Mullowney EXTERN_C_BEGIN 2239ae82921SPaul Mullowney #undef __FUNCT__ 2249ae82921SPaul Mullowney #define __FUNCT__ "MatCreate_MPIAIJCUSPARSE" 225bbf3fe20SPaul Mullowney PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 2269ae82921SPaul Mullowney { 2279ae82921SPaul Mullowney PetscErrorCode ierr; 228bbf3fe20SPaul Mullowney 
Mat_MPIAIJ *a; 229bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct; 2309ae82921SPaul Mullowney 2319ae82921SPaul Mullowney PetscFunctionBegin; 232bbf3fe20SPaul Mullowney ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 233bbf3fe20SPaul Mullowney ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatMPIAIJSetPreallocation_C", 2349ae82921SPaul Mullowney "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE", 2359ae82921SPaul Mullowney MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 236bbf3fe20SPaul Mullowney a = (Mat_MPIAIJ*)A->data; 237bbf3fe20SPaul Mullowney a->spptr = new Mat_MPIAIJCUSPARSE; 238bbf3fe20SPaul Mullowney cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 239e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 240e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 241bbf3fe20SPaul Mullowney A->ops->getvecs = MatGetVecs_MPIAIJCUSPARSE; 242bbf3fe20SPaul Mullowney A->ops->mult = MatMult_MPIAIJCUSPARSE; 243bbf3fe20SPaul Mullowney A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 244bbf3fe20SPaul Mullowney A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 245bbf3fe20SPaul Mullowney A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 246bbf3fe20SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 247e057df02SPaul Mullowney ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_MPIAIJCUSPARSE", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 248e057df02SPaul Mullowney 2499ae82921SPaul Mullowney PetscFunctionReturn(0); 2509ae82921SPaul Mullowney } 2519ae82921SPaul Mullowney EXTERN_C_END 2529ae82921SPaul Mullowney 2533f9c0db1SPaul Mullowney /*@ 2543f9c0db1SPaul Mullowney MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 255e057df02SPaul Mullowney (the default parallel PETSc format). 
   This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.
   This type is only available when using the 'txpetscgpu' package. Use --download-txpetscgpu
   to build/install PETSc to use different CUSPARSE base matrix types.

   Collective on MPI_Comm

   Input Parameters:
+  comm  - MPI communicator, set to PETSC_COMM_SELF
.  m     - number of local rows
.  n     - number of local columns
.  M     - number of global rows
.  N     - number of global columns
.  d_nz  - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
           (possibly different for each row) or PETSC_NULL
.  o_nz  - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
           (possibly different for each row) or PETSC_NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz/o_nnz are given then d_nz/o_nz are ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either the scalar counts or the
   arrays (not both). Set the counts to PETSC_DEFAULT and the arrays to
   PETSC_NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
297e057df02SPaul Mullowney 298e057df02SPaul Mullowney Level: intermediate 299e057df02SPaul Mullowney 300e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 301e057df02SPaul Mullowney @*/ 3029ae82921SPaul Mullowney #undef __FUNCT__ 3039ae82921SPaul Mullowney #define __FUNCT__ "MatCreateAIJCUSPARSE" 3049ae82921SPaul Mullowney PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 3059ae82921SPaul Mullowney { 3069ae82921SPaul Mullowney PetscErrorCode ierr; 3079ae82921SPaul Mullowney PetscMPIInt size; 3089ae82921SPaul Mullowney 3099ae82921SPaul Mullowney PetscFunctionBegin; 3109ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 3119ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 3129ae82921SPaul Mullowney ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 3139ae82921SPaul Mullowney if (size > 1) { 3149ae82921SPaul Mullowney ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 3159ae82921SPaul Mullowney ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 3169ae82921SPaul Mullowney } else { 3179ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3189ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 3199ae82921SPaul Mullowney } 3209ae82921SPaul Mullowney PetscFunctionReturn(0); 3219ae82921SPaul Mullowney } 3229ae82921SPaul Mullowney 3233f9c0db1SPaul Mullowney /*M 324e057df02SPaul Mullowney MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 325e057df02SPaul Mullowney 326e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in CSR format. 
327e057df02SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. Use of the 328e057df02SPaul Mullowney CUSPARSE library REQUIRES the 'txpetscgpu' package. ELL and HYB formats are also available 329e057df02SPaul Mullowney in the txpetscgpu package. Use --download-txpetscgpu to build/install PETSc to use different 330e057df02SPaul Mullowney GPU storage formats with CUSPARSE matrix types. 3319ae82921SPaul Mullowney 3329ae82921SPaul Mullowney This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 3339ae82921SPaul Mullowney and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 3349ae82921SPaul Mullowney MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 3359ae82921SPaul Mullowney for communicators controlling multiple processes. It is recommended that you call both of 3369ae82921SPaul Mullowney the above preallocation routines for simplicity. 3379ae82921SPaul Mullowney 3389ae82921SPaul Mullowney Options Database Keys: 339e057df02SPaul Mullowney + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 340e057df02SPaul Mullowney . -mat_cusparse_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). 341e057df02SPaul Mullowney . -mat_cusparse_mult_diag_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of diagonal matrix during a call to MatSetFromOptions(). 342e057df02SPaul Mullowney - -mat_cusparse_mult_offdiag_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). 
3439ae82921SPaul Mullowney 3449ae82921SPaul Mullowney Level: beginner 3459ae82921SPaul Mullowney 346e057df02SPaul Mullowney .seealso: MatCreateMPIAIJ,MATSEQAIJ,MATMPIAIJ, MATMPIAIJCUSPARSE, MATSEQAIJCUSPARSE, MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3479ae82921SPaul Mullowney M*/ 348