/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format.
*/

#include "petscconf.h"
PETSC_CUDA_EXTERN_C_BEGIN
#include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
//#include "petscbt.h"
#include "../src/vec/vec/impls/dvecimpl.h"
#include "petsc-private/vecimpl.h"
PETSC_CUDA_EXTERN_C_END
#undef VecType
#include "cusparsematimpl.h"

const char * const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* This is a hack, but I do not know of another way to pass this variable from one
   GPU_Matrix_Ifc class to another. It is necessary for the parallel SpMV: the same
   stream variable must be used in two different data structures, so a single
   instance of the stream is created here and reused. */
cudaStream_t theBodyStream=0;

PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat);
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

EXTERN_C_BEGIN
extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*);
EXTERN_C_END

/*
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver package providing triangular solvers (ILU)
  for sequential matrices on the GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp.

  Level: beginner
*/
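/* A minimal usage sketch (added for clarity, not part of the original source): the factor
   package registered here is typically reached either from the command line, e.g.

       -pc_type ilu -pc_factor_mat_solver_package cusparse

   with a matrix of type aijcusparse, or programmatically via MatGetFactor(). The call below
   is illustrative only; error checking is omitted and the surrounding setup
   (MatCreate()/MatSetType()/assembly) is assumed to have happened already.

       Mat F;
       MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_ILU,&F);

   MatGetFactor() then dispatches to MatGetFactor_seqaij_cusparse() defined below. */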
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr);
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT){
    ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  (*B)->factortype = ftype;
  PetscFunctionReturn(0);
}
EXTERN_C_END

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparseMat->format = format;
    break;
  case MAT_CUSPARSE_SOLVE:
    cusparseMatSolveStorageFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseMat->format = format;
    cusparseMatSolveStorageFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
EXTERN_C_END

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for AIJCUSPARSE matrices. This requires the txpetscgpu package. Use --download-txpetscgpu
   to build/install PETSc to use this package.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format = MAT_CUSPARSE_CSR; /* default; only overwritten when the option is given */
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
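/* A brief illustration (added for clarity, not from the original source): with the txpetscgpu
   package installed, the SpMV storage format of an AIJCUSPARSE matrix can be switched either
   at the command line,

       -mat_type aijcusparse -mat_cusparse_mult_storage_format ell

   or directly in code. The snippet below is a sketch with error checking omitted:

       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);CHKERRQ(ierr);

   Factored matrices consult -mat_cusparse_solve_storage_format (or MAT_CUSPARSE_SOLVE)
   instead, as handled by MatSetFromOptions_SeqAIJCUSPARSE() above. */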
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildLowerTriMatrix"
PetscErrorCode MatSeqAIJCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *ai = a->i,*aj = a->j,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscErrorCode               ierr;
  PetscInt                     *AiLo, *AjLo;
  PetscScalar                  *AALo;
  PetscInt                     i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]   = (PetscInt) 0;
      AiLo[n]   = nzLower;
      AjLo[0]   = (PetscInt) 0;
      AALo[0]   = (MatScalar) 1.0;
      v         = aa;
      vi        = aj;
      offset    = 1;
      rowOffset = 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat;
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
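/* Illustration (added for clarity, not part of the original code): for a 3x3 factor whose
   strictly lower part is
        row 1: l10
        row 2: l20 l21
   the loop above produces the complete CSR description of L with an explicit unit diagonal:
        AiLo = {0, 1, 3, 6}
        AjLo = {0,  0, 1,  0, 1, 2}
        AALo = {1.0,  l10, 1.0,  l20, l21, 1.0}
   which is the form handed to the cusparse lower triangular solve. */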

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildUpperTriMatrix"
PetscErrorCode MatSeqAIJCUSPARSEBuildUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscInt                     *AiUp, *AjUp;
  PetscScalar                  *AAUp;
  PetscInt                     i,nz, nzUpper, offset;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0] = (PetscInt) 0;
      AiUp[n] = nzUpper;
      offset  = nzUpper;
      for (i=n-1; i>=0; i--){
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cusparseMat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseTriFactors->format]);
      stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr);
      stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat);
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat;
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
"MatSeqAIJCUSPARSEAnalysiAndCopyToGPU" 320*e057df02SPaul Mullowney PetscErrorCode MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(Mat A) 3219ae82921SPaul Mullowney { 3229ae82921SPaul Mullowney PetscErrorCode ierr; 3239ae82921SPaul Mullowney Mat_SeqAIJ *a=(Mat_SeqAIJ *)A->data; 3249ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 3259ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 3269ae82921SPaul Mullowney PetscBool row_identity,col_identity; 3279ae82921SPaul Mullowney const PetscInt *r,*c; 3289ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3299ae82921SPaul Mullowney 3309ae82921SPaul Mullowney PetscFunctionBegin; 331*e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr); 332*e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr); 3339ae82921SPaul Mullowney cusparseTriFactors->tempvec = new CUSPARRAY; 3349ae82921SPaul Mullowney cusparseTriFactors->tempvec->resize(n); 3359ae82921SPaul Mullowney 3369ae82921SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_BOTH; 337*e057df02SPaul Mullowney /*lower triangular indices */ 3389ae82921SPaul Mullowney ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3399ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3409ae82921SPaul Mullowney if (!row_identity) 3419ae82921SPaul Mullowney ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr); 3429ae82921SPaul Mullowney ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3439ae82921SPaul Mullowney 344*e057df02SPaul Mullowney /*upper triangular indices */ 3459ae82921SPaul Mullowney ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 3469ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3479ae82921SPaul Mullowney if (!col_identity) 3489ae82921SPaul Mullowney ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr); 3499ae82921SPaul Mullowney ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 3509ae82921SPaul Mullowney PetscFunctionReturn(0); 3519ae82921SPaul Mullowney } 3529ae82921SPaul Mullowney 3539ae82921SPaul Mullowney #undef __FUNCT__ 3549ae82921SPaul Mullowney #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE" 3559ae82921SPaul Mullowney PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 3569ae82921SPaul Mullowney { 3579ae82921SPaul Mullowney PetscErrorCode ierr; 3589ae82921SPaul Mullowney Mat_SeqAIJ *b=(Mat_SeqAIJ *)B->data; 3599ae82921SPaul Mullowney IS isrow = b->row,iscol = b->col; 3609ae82921SPaul Mullowney PetscBool row_identity,col_identity; 3619ae82921SPaul Mullowney 3629ae82921SPaul Mullowney PetscFunctionBegin; 3639ae82921SPaul Mullowney ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 364*e057df02SPaul Mullowney /* determine which version of MatSolve needs to be used. 
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
  else                              B->ops->solve = MatSolve_SeqAIJCUSPARSE;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU, *bGPU;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *tempGPU = (CUSPARRAY*)cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve with reordering */
  ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr);
  stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);
  ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode               ierr;
  CUSPARRAY                    *xGPU, *bGPU;
  cusparseStatus_t             stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  GPU_Matrix_Ifc               *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
  CUSPARRAY                    *tempGPU = (CUSPARRAY*)cusparseTriFactors->tempvec;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* solve */
  stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat);
  stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           m = A->rmap->n,*ii,*ridx;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /*
      It may be possible to reuse nonzero structure with new matrix values but
      for simplicity and ensured correctness we delete and build a new matrix on
      the GPU. Likely a very small performance hit.
    */
    if (cusparseMat->mat){
      try {
        delete cusparseMat->mat;
        if (cusparseMat->tempvec)
          delete cusparseMat->tempvec;
      } catch(char* ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
    }
    try {
      cusparseMat->nonzerorow=0;
      for (int j = 0; j<m; j++)
        cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU ... only relevant for CSR storage */
        int k=0;
        ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]   = a->i[j];
            ridx[k] = j;
            k++;
          }
        }
        ii[cusparseMat->nonzerorow] = a->nz;
        m = cusparseMat->nonzerorow;
      }

      /* Build our matrix ... first determine the GPU storage type */
      cusparseMat->mat = GPU_Matrix_Factory::getNew(MatCUSPARSEStorageFormats[cusparseMat->format]);

      /* Create the streams and events (if desired). */
      PetscMPIInt size;
      ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
      ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr);

      /* FILL MODE UPPER is irrelevant */
      cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);

      /* lastly, build the matrix */
      ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr);
      cusparseMat->mat->setCPRowIndices(ridx, m);
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparseMat->tempvec = new CUSPARRAY;
      cusparseMat->tempvec->resize(m);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    A->valid_GPU_matrix = PETSC_CUSP_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
    ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr);
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  try {
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiply(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr);
#endif
  } catch (char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparseMat->mat->hasNonZeroStream()) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray,*zarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CUSPARRAY          *xarray,*yarray,*zarray;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose */
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiplyAdd(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr);
#endif

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
For good matrix 702*e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 703*e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 704*e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 7059ae82921SPaul Mullowney 7069ae82921SPaul Mullowney Collective on MPI_Comm 7079ae82921SPaul Mullowney 7089ae82921SPaul Mullowney Input Parameters: 7099ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 7109ae82921SPaul Mullowney . m - number of rows 7119ae82921SPaul Mullowney . n - number of columns 7129ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 7139ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 7149ae82921SPaul Mullowney (possibly different for each row) or PETSC_NULL 7159ae82921SPaul Mullowney 7169ae82921SPaul Mullowney Output Parameter: 7179ae82921SPaul Mullowney . A - the matrix 7189ae82921SPaul Mullowney 7199ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 7209ae82921SPaul Mullowney MatXXXXSetPreallocation() paradgm instead of this routine directly. 7219ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 7229ae82921SPaul Mullowney 7239ae82921SPaul Mullowney Notes: 7249ae82921SPaul Mullowney If nnz is given then nz is ignored 7259ae82921SPaul Mullowney 7269ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 7279ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 7289ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 7299ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 7309ae82921SPaul Mullowney 7319ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 7329ae82921SPaul Mullowney Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory 7339ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 7349ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 7359ae82921SPaul Mullowney 7369ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 7379ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 7389ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 7399ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
        delete (GPU_Matrix_Ifc*)(cusparseMat->mat);
      }
      if (cusparseMat->tempvec!=0)
        delete cusparseMat->tempvec;
      delete cusparseMat;
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      GPU_Matrix_Ifc               *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
      GPU_Matrix_Ifc               *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr;
      delete (GPU_Matrix_Ifc*)cusparseMatLo;
      delete (GPU_Matrix_Ifc*)cusparseMatUp;
      delete (CUSPARRAY*)cusparseTriFactors->tempvec;
      delete cusparseTriFactors;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  if (MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat);
    MAT_cusparseHandle=0;
  }
  /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or
     PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat     = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->tempvec = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format  = MAT_CUSPARSE_CSR;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->tempvec        = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format         = cusparseMatSolveStorageFormat;
  }
  /* Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) */
  if (!MAT_cusparseHandle) {
    cusparseStatus_t stat;
    stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat);
  }
  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
Note the difference with the implementation in 834*e057df02SPaul Mullowney MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */ 8359ae82921SPaul Mullowney ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr); 8369ae82921SPaul Mullowney B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 8379ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 8389ae82921SPaul Mullowney B->ops->getvecs = MatGetVecs_SeqAIJCUSPARSE; 8399ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 840ca45077fSPaul Mullowney B->ops->mult = MatMult_SeqAIJCUSPARSE; 841ca45077fSPaul Mullowney B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 842ca45077fSPaul Mullowney B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 843ca45077fSPaul Mullowney B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 8449ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 8459ae82921SPaul Mullowney B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 846*e057df02SPaul Mullowney ierr = PetscObjectComposeFunctionDynamic((PetscObject)B, "MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_SeqAIJCUSPARSE", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 8479ae82921SPaul Mullowney PetscFunctionReturn(0); 8489ae82921SPaul Mullowney } 8499ae82921SPaul Mullowney EXTERN_C_END 8509ae82921SPaul Mullowney 851*e057df02SPaul Mullowney /*M 852*e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 853*e057df02SPaul Mullowney 854*e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 855*e057df02SPaul Mullowney CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using 856*e057df02SPaul Mullowney the CUSPARSE library. This type is only available when using the 'txpetscgpu' package. 857*e057df02SPaul Mullowney Use --download-txpetscgpu to build/install PETSc to use different CUSPARSE library and 858*e057df02SPaul Mullowney the different GPU storage formats. 859*e057df02SPaul Mullowney 860*e057df02SPaul Mullowney Options Database Keys: 861*e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 862*e057df02SPaul Mullowney . -mat_cusparse_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package. 863*e057df02SPaul Mullowney . -mat_cusparse_mult_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package. 864*e057df02SPaul Mullowney - -mat_cusparse_solve_storage_format csr (ell (ellpack) or hyb (hybrid)) sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Only available with 'txpetscgpu' package. 865*e057df02SPaul Mullowney 866*e057df02SPaul Mullowney Level: beginner 867*e057df02SPaul Mullowney 868*e057df02SPaul Mullowney .seealso: MatCreateMPIAIJ,MATSEQAIJ,MATMPIAIJ, MATMPIAIJCUSPARSE, MATSEQAIJCUSPARSE, MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 869*e057df02SPaul Mullowney M*/ 870