19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3bc3f50f2SPaul Mullowney matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 59ae82921SPaul Mullowney 69ae82921SPaul Mullowney #include "petscconf.h" 79ae82921SPaul Mullowney #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 8087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 99ae82921SPaul Mullowney #include "../src/vec/vec/impls/dvecimpl.h" 109ae82921SPaul Mullowney #include "petsc-private/vecimpl.h" 119ae82921SPaul Mullowney #undef VecType 129ae82921SPaul Mullowney #include "cusparsematimpl.h" 13bc3f50f2SPaul Mullowney 14e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 159ae82921SPaul Mullowney 16087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 17087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 18087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 19087f3262SPaul Mullowney 206fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 216fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 226fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 23087f3262SPaul Mullowney 246fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 256fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 266fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 276fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 286fa9248bSJed Brown static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat); 296fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 306fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 316fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 326fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 339ae82921SPaul Mullowney 349ae82921SPaul Mullowney #undef __FUNCT__ 359ae82921SPaul Mullowney #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse" 369ae82921SPaul Mullowney PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type) 379ae82921SPaul Mullowney { 389ae82921SPaul Mullowney PetscFunctionBegin; 399ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 409ae82921SPaul Mullowney PetscFunctionReturn(0); 419ae82921SPaul Mullowney } 429ae82921SPaul Mullowney 43c708e6cdSJed Brown /*MC 44087f3262SPaul Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 45087f3262SPaul Mullowney on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 46087f3262SPaul Mullowney algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 47087f3262SPaul Mullowney performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 48087f3262SPaul Mullowney CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 49087f3262SPaul Mullowney algorithms are not recommended. This class does NOT support direct solver operations. 50c708e6cdSJed Brown 51087f3262SPaul Mullowney Consult CUSPARSE documentation for more information about the matrix storage formats 52087f3262SPaul Mullowney which correspond to the options database keys below. 53c708e6cdSJed Brown 54c708e6cdSJed Brown Options Database Keys: 55*aa372e3fSPaul Mullowney . -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 569ae82921SPaul Mullowney 579ae82921SPaul Mullowney Level: beginner 58c708e6cdSJed Brown 59c708e6cdSJed Brown .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 60c708e6cdSJed Brown M*/ 619ae82921SPaul Mullowney 629ae82921SPaul Mullowney #undef __FUNCT__ 639ae82921SPaul Mullowney #define __FUNCT__ "MatGetFactor_seqaij_cusparse" 643c3a0fd4SSatish Balay PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B) 659ae82921SPaul Mullowney { 669ae82921SPaul Mullowney PetscErrorCode ierr; 67bc3f50f2SPaul Mullowney PetscInt n = A->rmap->n; 689ae82921SPaul Mullowney 699ae82921SPaul Mullowney PetscFunctionBegin; 70bc3f50f2SPaul Mullowney ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 71404133a2SPaul Mullowney (*B)->factortype = ftype; 72bc3f50f2SPaul Mullowney ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 739ae82921SPaul Mullowney ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 742205254eSKarl Rupp 75087f3262SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 76bc3f50f2SPaul Mullowney ierr = MatSetBlockSizes(*B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr); 779ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 789ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 79087f3262SPaul Mullowney } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 80087f3262SPaul Mullowney (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 81087f3262SPaul Mullowney (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 829ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 83bc3f50f2SPaul Mullowney 84fa03d054SJed Brown ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 8562a20339SJed Brown ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); 869ae82921SPaul Mullowney PetscFunctionReturn(0); 879ae82921SPaul Mullowney } 889ae82921SPaul Mullowney 899ae82921SPaul Mullowney #undef __FUNCT__ 90e057df02SPaul Mullowney #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE" 91bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 92ca45077fSPaul Mullowney { 93*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 946e111a19SKarl Rupp 95ca45077fSPaul Mullowney PetscFunctionBegin; 96ca45077fSPaul Mullowney switch (op) { 97e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 98*aa372e3fSPaul Mullowney cusparsestruct->format = format; 99ca45077fSPaul Mullowney break; 100e057df02SPaul Mullowney case MAT_CUSPARSE_SOLVE: 101e057df02SPaul Mullowney cusparseMatSolveStorageFormat = format; 102ca45077fSPaul Mullowney break; 103e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 104*aa372e3fSPaul Mullowney cusparsestruct->format = format; 105e057df02SPaul Mullowney cusparseMatSolveStorageFormat = format; 106ca45077fSPaul Mullowney break; 107ca45077fSPaul Mullowney default: 108e057df02SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op); 109ca45077fSPaul Mullowney } 110ca45077fSPaul Mullowney PetscFunctionReturn(0); 111ca45077fSPaul Mullowney } 1129ae82921SPaul Mullowney 113e057df02SPaul Mullowney /*@ 114e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 115e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 116*aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 117e057df02SPaul Mullowney Not Collective 118e057df02SPaul Mullowney 119e057df02SPaul Mullowney Input Parameters: 1208468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 1218468deeeSKarl Rupp . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 1228468deeeSKarl Rupp - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB) 123e057df02SPaul Mullowney 124e057df02SPaul Mullowney Output Parameter: 125e057df02SPaul Mullowney 126e057df02SPaul Mullowney Level: intermediate 127e057df02SPaul Mullowney 1288468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 129e057df02SPaul Mullowney @*/ 130e057df02SPaul Mullowney #undef __FUNCT__ 131e057df02SPaul Mullowney #define __FUNCT__ "MatCUSPARSESetFormat" 132e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 133e057df02SPaul Mullowney { 134e057df02SPaul Mullowney PetscErrorCode ierr; 1356e111a19SKarl Rupp 136e057df02SPaul Mullowney PetscFunctionBegin; 137e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 138e057df02SPaul Mullowney ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 139e057df02SPaul Mullowney PetscFunctionReturn(0); 140e057df02SPaul Mullowney } 141e057df02SPaul Mullowney 1429ae82921SPaul Mullowney #undef __FUNCT__ 1439ae82921SPaul Mullowney #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE" 1446fa9248bSJed Brown static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A) 1459ae82921SPaul Mullowney { 1469ae82921SPaul Mullowney PetscErrorCode ierr; 147e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 1489ae82921SPaul Mullowney PetscBool flg; 1496e111a19SKarl Rupp 1509ae82921SPaul Mullowney PetscFunctionBegin; 151e057df02SPaul Mullowney ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr); 152e057df02SPaul Mullowney ierr = PetscObjectOptionsBegin((PetscObject)A); 1539ae82921SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 154e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 1557083f85cSSatish Balay "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 156e057df02SPaul Mullowney if (flg) { 157e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 158045c96e1SPaul Mullowney } 1592205254eSKarl Rupp } else { 160e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve", 1617083f85cSSatish Balay "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 162e057df02SPaul Mullowney if (flg) { 163e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr); 164e057df02SPaul Mullowney } 1659ae82921SPaul Mullowney } 1664c87dfd4SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 1677083f85cSSatish Balay "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 1684c87dfd4SPaul Mullowney if (flg) { 1694c87dfd4SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 1704c87dfd4SPaul Mullowney } 1719ae82921SPaul Mullowney ierr = PetscOptionsEnd();CHKERRQ(ierr); 1729ae82921SPaul Mullowney PetscFunctionReturn(0); 1739ae82921SPaul Mullowney 1749ae82921SPaul Mullowney } 1759ae82921SPaul Mullowney 1769ae82921SPaul Mullowney #undef __FUNCT__ 1779ae82921SPaul Mullowney #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE" 1786fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1799ae82921SPaul Mullowney { 1809ae82921SPaul Mullowney PetscErrorCode ierr; 1819ae82921SPaul Mullowney 1829ae82921SPaul Mullowney PetscFunctionBegin; 1839ae82921SPaul Mullowney ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1849ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 1859ae82921SPaul Mullowney PetscFunctionReturn(0); 1869ae82921SPaul Mullowney } 1879ae82921SPaul Mullowney 1889ae82921SPaul Mullowney #undef __FUNCT__ 1899ae82921SPaul Mullowney #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE" 1906fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1919ae82921SPaul Mullowney { 1929ae82921SPaul Mullowney PetscErrorCode ierr; 1939ae82921SPaul Mullowney 1949ae82921SPaul Mullowney PetscFunctionBegin; 1959ae82921SPaul Mullowney ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 1969ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 1979ae82921SPaul Mullowney PetscFunctionReturn(0); 1989ae82921SPaul Mullowney } 1999ae82921SPaul Mullowney 2009ae82921SPaul Mullowney #undef __FUNCT__ 201087f3262SPaul Mullowney #define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE" 202087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 203087f3262SPaul Mullowney { 204087f3262SPaul Mullowney PetscErrorCode ierr; 205087f3262SPaul Mullowney 206087f3262SPaul Mullowney PetscFunctionBegin; 207087f3262SPaul Mullowney ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 208087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 209087f3262SPaul Mullowney PetscFunctionReturn(0); 210087f3262SPaul Mullowney } 211087f3262SPaul Mullowney 212087f3262SPaul Mullowney #undef __FUNCT__ 213087f3262SPaul Mullowney #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE" 214087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 215087f3262SPaul Mullowney { 216087f3262SPaul Mullowney PetscErrorCode ierr; 217087f3262SPaul Mullowney 218087f3262SPaul Mullowney PetscFunctionBegin; 219087f3262SPaul Mullowney ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 220087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 221087f3262SPaul Mullowney PetscFunctionReturn(0); 222087f3262SPaul Mullowney } 223087f3262SPaul Mullowney 224087f3262SPaul Mullowney #undef __FUNCT__ 225087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix" 226087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 2279ae82921SPaul Mullowney { 2289ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2299ae82921SPaul Mullowney PetscInt n = A->rmap->n; 2309ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 231*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 2329ae82921SPaul Mullowney cusparseStatus_t stat; 2339ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 2349ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 2359ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 2369ae82921SPaul Mullowney PetscScalar *AALo; 2379ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 238b175d8bbSPaul Mullowney PetscErrorCode ierr; 2399ae82921SPaul Mullowney 2409ae82921SPaul Mullowney PetscFunctionBegin; 2419ae82921SPaul Mullowney if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 2429ae82921SPaul Mullowney try { 2439ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 2449ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 2459ae82921SPaul Mullowney 2469ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 2479ae82921SPaul Mullowney ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 2489ae82921SPaul Mullowney ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr); 2499ae82921SPaul Mullowney ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr); 2509ae82921SPaul Mullowney 2519ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 2529ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 2539ae82921SPaul Mullowney AiLo[n] = nzLower; 2549ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 2559ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 2569ae82921SPaul Mullowney v = aa; 2579ae82921SPaul Mullowney vi = aj; 2589ae82921SPaul Mullowney offset = 1; 2599ae82921SPaul Mullowney rowOffset= 1; 2609ae82921SPaul Mullowney for (i=1; i<n; i++) { 2619ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 262e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 2639ae82921SPaul Mullowney AiLo[i] = rowOffset; 2649ae82921SPaul Mullowney rowOffset += nz+1; 2659ae82921SPaul Mullowney 2669ae82921SPaul Mullowney ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 2679ae82921SPaul Mullowney ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 2689ae82921SPaul Mullowney 2699ae82921SPaul Mullowney offset += nz; 2709ae82921SPaul Mullowney AjLo[offset] = (PetscInt) i; 2719ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 2729ae82921SPaul Mullowney offset += 1; 2739ae82921SPaul Mullowney 2749ae82921SPaul Mullowney v += nz; 2759ae82921SPaul Mullowney vi += nz; 2769ae82921SPaul Mullowney } 2772205254eSKarl Rupp 278*aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 279*aa372e3fSPaul Mullowney loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 2802205254eSKarl Rupp 281*aa372e3fSPaul Mullowney /* Create the matrix description */ 282*aa372e3fSPaul Mullowney stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat); 283*aa372e3fSPaul Mullowney stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 284*aa372e3fSPaul Mullowney stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 285*aa372e3fSPaul Mullowney stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat); 286*aa372e3fSPaul Mullowney stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); 287*aa372e3fSPaul Mullowney 288*aa372e3fSPaul Mullowney /* Create the solve analysis information */ 289*aa372e3fSPaul Mullowney stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat); 290*aa372e3fSPaul Mullowney 291*aa372e3fSPaul Mullowney /* set the operation */ 292*aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 293*aa372e3fSPaul Mullowney 294*aa372e3fSPaul Mullowney /* set the matrix */ 295*aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 296*aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 297*aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 298*aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 299*aa372e3fSPaul Mullowney 300*aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 301*aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 302*aa372e3fSPaul Mullowney 303*aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 304*aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 305*aa372e3fSPaul Mullowney 306*aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 307*aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 308*aa372e3fSPaul Mullowney 309*aa372e3fSPaul Mullowney /* perform the solve analysis */ 310*aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 311*aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 312*aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 313*aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat); 314*aa372e3fSPaul Mullowney 315*aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 316*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 3172205254eSKarl Rupp 3189ae82921SPaul Mullowney ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr); 3199ae82921SPaul Mullowney ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr); 3209ae82921SPaul Mullowney ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 3219ae82921SPaul Mullowney } catch(char *ex) { 3229ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 3239ae82921SPaul Mullowney } 3249ae82921SPaul Mullowney } 3259ae82921SPaul Mullowney PetscFunctionReturn(0); 3269ae82921SPaul Mullowney } 3279ae82921SPaul Mullowney 3289ae82921SPaul Mullowney #undef __FUNCT__ 329087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix" 330087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 3319ae82921SPaul Mullowney { 3329ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3339ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3349ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 335*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 3369ae82921SPaul Mullowney cusparseStatus_t stat; 3379ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 3389ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 3399ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 3409ae82921SPaul Mullowney PetscScalar *AAUp; 3419ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 3429ae82921SPaul Mullowney PetscErrorCode ierr; 3439ae82921SPaul Mullowney 3449ae82921SPaul Mullowney PetscFunctionBegin; 3459ae82921SPaul Mullowney if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 3469ae82921SPaul Mullowney try { 3479ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. */ 3489ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 3499ae82921SPaul Mullowney 3509ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 3519ae82921SPaul Mullowney ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 3529ae82921SPaul Mullowney ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 3539ae82921SPaul Mullowney ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 3549ae82921SPaul Mullowney 3559ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 3569ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 3579ae82921SPaul Mullowney AiUp[n]=nzUpper; 3589ae82921SPaul Mullowney offset = nzUpper; 3599ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 3609ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 3619ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 3629ae82921SPaul Mullowney 363e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 3649ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 3659ae82921SPaul Mullowney 366e057df02SPaul Mullowney /* decrement the offset */ 3679ae82921SPaul Mullowney offset -= (nz+1); 3689ae82921SPaul Mullowney 369e057df02SPaul Mullowney /* first, set the diagonal elements */ 3709ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 3719ae82921SPaul Mullowney AAUp[offset] = 1./v[nz]; 3729ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 3739ae82921SPaul Mullowney 3749ae82921SPaul Mullowney ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 3759ae82921SPaul Mullowney ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 3769ae82921SPaul Mullowney } 3772205254eSKarl Rupp 378*aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 379*aa372e3fSPaul Mullowney upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 3802205254eSKarl Rupp 381*aa372e3fSPaul Mullowney /* Create the matrix description */ 382*aa372e3fSPaul Mullowney stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat); 383*aa372e3fSPaul Mullowney stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 384*aa372e3fSPaul Mullowney stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 385*aa372e3fSPaul Mullowney stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 386*aa372e3fSPaul Mullowney stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 387*aa372e3fSPaul Mullowney 388*aa372e3fSPaul Mullowney /* Create the solve analysis information */ 389*aa372e3fSPaul Mullowney stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat); 390*aa372e3fSPaul Mullowney 391*aa372e3fSPaul Mullowney /* set the operation */ 392*aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 393*aa372e3fSPaul Mullowney 394*aa372e3fSPaul Mullowney /* set the matrix */ 395*aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 396*aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 397*aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 398*aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 399*aa372e3fSPaul Mullowney 400*aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 401*aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 402*aa372e3fSPaul Mullowney 403*aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 404*aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 405*aa372e3fSPaul Mullowney 406*aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 407*aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 408*aa372e3fSPaul Mullowney 409*aa372e3fSPaul Mullowney /* perform the solve analysis */ 410*aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 411*aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 412*aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 413*aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat); 414*aa372e3fSPaul Mullowney 415*aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 416*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 4172205254eSKarl Rupp 4189ae82921SPaul Mullowney ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 4199ae82921SPaul Mullowney ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 4209ae82921SPaul Mullowney ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 4219ae82921SPaul Mullowney } catch(char *ex) { 4229ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 4239ae82921SPaul Mullowney } 4249ae82921SPaul Mullowney } 4259ae82921SPaul Mullowney PetscFunctionReturn(0); 4269ae82921SPaul Mullowney } 4279ae82921SPaul Mullowney 4289ae82921SPaul Mullowney #undef __FUNCT__ 429087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU" 430087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 4319ae82921SPaul Mullowney { 4329ae82921SPaul Mullowney PetscErrorCode ierr; 4339ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4349ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 4359ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 4369ae82921SPaul Mullowney PetscBool row_identity,col_identity; 4379ae82921SPaul Mullowney const PetscInt *r,*c; 4389ae82921SPaul Mullowney PetscInt n = A->rmap->n; 4399ae82921SPaul Mullowney 4409ae82921SPaul Mullowney PetscFunctionBegin; 441087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 442087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 4432205254eSKarl Rupp 444*aa372e3fSPaul Mullowney cusparseTriFactors->workVector = new THRUSTARRAY; 445*aa372e3fSPaul Mullowney cusparseTriFactors->workVector->resize(n); 446*aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 4479ae82921SPaul Mullowney 4489ae82921SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_BOTH; 449e057df02SPaul Mullowney /*lower triangular indices */ 4509ae82921SPaul Mullowney ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 4519ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 4522205254eSKarl Rupp if (!row_identity) { 453*aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 454*aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 4552205254eSKarl Rupp } 4569ae82921SPaul Mullowney ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 4579ae82921SPaul Mullowney 458e057df02SPaul Mullowney /*upper triangular indices */ 4599ae82921SPaul Mullowney ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 4609ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 4612205254eSKarl Rupp if (!col_identity) { 462*aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 463*aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 4642205254eSKarl Rupp } 4659ae82921SPaul Mullowney ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 4669ae82921SPaul Mullowney PetscFunctionReturn(0); 4679ae82921SPaul Mullowney } 4689ae82921SPaul Mullowney 4699ae82921SPaul Mullowney #undef __FUNCT__ 470087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices" 471087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 472087f3262SPaul Mullowney { 473087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 474087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 475*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 476*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 477087f3262SPaul Mullowney cusparseStatus_t stat; 478087f3262SPaul Mullowney PetscErrorCode ierr; 479087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 480087f3262SPaul Mullowney PetscScalar *AAUp; 481087f3262SPaul Mullowney PetscScalar *AALo; 482087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 483087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 484087f3262SPaul Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 485087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 486087f3262SPaul Mullowney 487087f3262SPaul Mullowney PetscFunctionBegin; 488087f3262SPaul Mullowney if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 489087f3262SPaul Mullowney try { 490087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 491087f3262SPaul Mullowney ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 492087f3262SPaul Mullowney ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 493087f3262SPaul Mullowney ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 494087f3262SPaul Mullowney ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 495087f3262SPaul Mullowney 496087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 497087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 498087f3262SPaul Mullowney AiUp[n]=nzUpper; 499087f3262SPaul Mullowney offset = 0; 500087f3262SPaul Mullowney for (i=0; i<n; i++) { 501087f3262SPaul Mullowney /* set the pointers */ 502087f3262SPaul Mullowney v = aa + ai[i]; 503087f3262SPaul Mullowney vj = aj + ai[i]; 504087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 505087f3262SPaul Mullowney 506087f3262SPaul Mullowney /* first, set the diagonal elements */ 507087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 508087f3262SPaul Mullowney AAUp[offset] = 1.0/v[nz]; 509087f3262SPaul Mullowney AiUp[i] = offset; 510087f3262SPaul Mullowney AALo[offset] = 1.0/v[nz]; 511087f3262SPaul Mullowney 512087f3262SPaul Mullowney offset+=1; 513087f3262SPaul Mullowney if (nz>0) { 514087f3262SPaul Mullowney ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr); 515087f3262SPaul Mullowney ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 516087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 517087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 518087f3262SPaul Mullowney AALo[j] = AAUp[j]/v[nz]; 519087f3262SPaul Mullowney } 520087f3262SPaul Mullowney offset+=nz; 521087f3262SPaul Mullowney } 522087f3262SPaul Mullowney } 523087f3262SPaul Mullowney 524*aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 525*aa372e3fSPaul Mullowney upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 526087f3262SPaul Mullowney 527*aa372e3fSPaul Mullowney /* Create the matrix description */ 528*aa372e3fSPaul Mullowney stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat); 529*aa372e3fSPaul Mullowney stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 530*aa372e3fSPaul Mullowney stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 531*aa372e3fSPaul Mullowney stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 532*aa372e3fSPaul Mullowney stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); 533087f3262SPaul Mullowney 534*aa372e3fSPaul Mullowney /* Create the solve analysis information */ 535*aa372e3fSPaul Mullowney stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat); 536*aa372e3fSPaul Mullowney 537*aa372e3fSPaul Mullowney /* set the operation */ 538*aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 539*aa372e3fSPaul Mullowney 540*aa372e3fSPaul Mullowney /* set the matrix */ 541*aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 542*aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 543*aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 544*aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 545*aa372e3fSPaul Mullowney 546*aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 547*aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 548*aa372e3fSPaul Mullowney 549*aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 550*aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 551*aa372e3fSPaul Mullowney 552*aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 553*aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 554*aa372e3fSPaul Mullowney 555*aa372e3fSPaul Mullowney /* perform the solve analysis */ 556*aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 557*aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 558*aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 559*aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat); 560*aa372e3fSPaul Mullowney 561*aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 562*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 563*aa372e3fSPaul Mullowney 564*aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 565*aa372e3fSPaul Mullowney loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 566*aa372e3fSPaul Mullowney 567*aa372e3fSPaul Mullowney /* Create the matrix description */ 568*aa372e3fSPaul Mullowney stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat); 569*aa372e3fSPaul Mullowney stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 570*aa372e3fSPaul Mullowney stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 571*aa372e3fSPaul Mullowney stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 572*aa372e3fSPaul Mullowney stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 573*aa372e3fSPaul Mullowney 574*aa372e3fSPaul Mullowney /* Create the solve analysis information */ 575*aa372e3fSPaul Mullowney stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat); 576*aa372e3fSPaul Mullowney 577*aa372e3fSPaul Mullowney /* set the operation */ 578*aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 579*aa372e3fSPaul Mullowney 580*aa372e3fSPaul Mullowney /* set the matrix */ 581*aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 582*aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 583*aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 584*aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 585*aa372e3fSPaul Mullowney 586*aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 587*aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 588*aa372e3fSPaul Mullowney 589*aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 590*aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 591*aa372e3fSPaul Mullowney 592*aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 593*aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 594*aa372e3fSPaul Mullowney 595*aa372e3fSPaul Mullowney /* perform the solve analysis */ 596*aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 597*aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 598*aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 599*aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat); 600*aa372e3fSPaul Mullowney 601*aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 602*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 603087f3262SPaul Mullowney 604087f3262SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_BOTH; 605087f3262SPaul Mullowney ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 606087f3262SPaul Mullowney ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 607087f3262SPaul Mullowney ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 608087f3262SPaul Mullowney ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 609087f3262SPaul Mullowney } catch(char *ex) { 610087f3262SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 611087f3262SPaul Mullowney } 612087f3262SPaul Mullowney } 613087f3262SPaul Mullowney PetscFunctionReturn(0); 614087f3262SPaul Mullowney } 615087f3262SPaul Mullowney 616087f3262SPaul Mullowney #undef __FUNCT__ 617087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU" 618087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 6199ae82921SPaul Mullowney { 6209ae82921SPaul Mullowney PetscErrorCode ierr; 621087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 622087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 623087f3262SPaul Mullowney IS ip = a->row; 624087f3262SPaul Mullowney const PetscInt *rip; 625087f3262SPaul Mullowney PetscBool perm_identity; 626087f3262SPaul Mullowney PetscInt n = A->rmap->n; 627087f3262SPaul Mullowney 628087f3262SPaul Mullowney PetscFunctionBegin; 629087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 630*aa372e3fSPaul Mullowney cusparseTriFactors->workVector = new THRUSTARRAY; 631*aa372e3fSPaul Mullowney cusparseTriFactors->workVector->resize(n); 632*aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 633*aa372e3fSPaul Mullowney 634087f3262SPaul Mullowney /*lower triangular indices */ 635087f3262SPaul Mullowney ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 636087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 637087f3262SPaul Mullowney if (!perm_identity) { 638*aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 639*aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 640*aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 641*aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(rip, rip+n); 642087f3262SPaul Mullowney } 643087f3262SPaul Mullowney ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 644087f3262SPaul Mullowney PetscFunctionReturn(0); 645087f3262SPaul Mullowney } 646087f3262SPaul Mullowney 647087f3262SPaul Mullowney #undef __FUNCT__ 6489ae82921SPaul Mullowney #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE" 6496fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 6509ae82921SPaul Mullowney { 6519ae82921SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 6529ae82921SPaul Mullowney IS isrow = b->row,iscol = b->col; 6539ae82921SPaul Mullowney PetscBool row_identity,col_identity; 654b175d8bbSPaul Mullowney PetscErrorCode ierr; 6559ae82921SPaul Mullowney 6569ae82921SPaul Mullowney PetscFunctionBegin; 6579ae82921SPaul Mullowney ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 658e057df02SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 6599ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 6609ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 661bda325fcSPaul Mullowney if (row_identity && col_identity) { 662bda325fcSPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 663bda325fcSPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 664bda325fcSPaul Mullowney } else { 665bda325fcSPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 666bda325fcSPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 667bda325fcSPaul Mullowney } 6688dc1d2a3SPaul Mullowney 669e057df02SPaul Mullowney /* get the triangular factors */ 670087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 6719ae82921SPaul Mullowney PetscFunctionReturn(0); 6729ae82921SPaul Mullowney } 6739ae82921SPaul Mullowney 674087f3262SPaul Mullowney #undef __FUNCT__ 675087f3262SPaul Mullowney #define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE" 676087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 677087f3262SPaul Mullowney { 678087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 679087f3262SPaul Mullowney IS ip = b->row; 680087f3262SPaul Mullowney PetscBool perm_identity; 681b175d8bbSPaul Mullowney PetscErrorCode ierr; 682087f3262SPaul Mullowney 683087f3262SPaul Mullowney PetscFunctionBegin; 684087f3262SPaul Mullowney ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 685087f3262SPaul Mullowney 686087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 687087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 688087f3262SPaul Mullowney if (perm_identity) { 689087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 690087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 691087f3262SPaul Mullowney } else { 692087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 693087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 694087f3262SPaul Mullowney } 695087f3262SPaul Mullowney 696087f3262SPaul Mullowney /* get the triangular factors */ 697087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 698087f3262SPaul Mullowney PetscFunctionReturn(0); 699087f3262SPaul Mullowney } 7009ae82921SPaul Mullowney 701bda325fcSPaul Mullowney #undef __FUNCT__ 702bda325fcSPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve" 703b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 704bda325fcSPaul Mullowney { 705bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 706*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 707*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 708*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 709*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 710bda325fcSPaul Mullowney cusparseStatus_t stat; 711*aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 712*aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 713*aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 714*aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 715b175d8bbSPaul Mullowney 716bda325fcSPaul Mullowney PetscFunctionBegin; 717bda325fcSPaul Mullowney 718*aa372e3fSPaul Mullowney /*********************************************/ 719*aa372e3fSPaul Mullowney /* Now the Transpose of the Lower Tri Factor */ 720*aa372e3fSPaul Mullowney /*********************************************/ 721*aa372e3fSPaul Mullowney 722*aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 723*aa372e3fSPaul Mullowney loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 724*aa372e3fSPaul Mullowney 725*aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 726*aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 727*aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 728*aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 729*aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 730*aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 731*aa372e3fSPaul Mullowney 732*aa372e3fSPaul Mullowney /* Create the matrix description */ 733*aa372e3fSPaul Mullowney stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat); 734*aa372e3fSPaul Mullowney stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat); 735*aa372e3fSPaul Mullowney stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat); 736*aa372e3fSPaul Mullowney stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat); 737*aa372e3fSPaul Mullowney stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat); 738*aa372e3fSPaul Mullowney 739*aa372e3fSPaul Mullowney /* Create the solve analysis information */ 740*aa372e3fSPaul Mullowney stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat); 741*aa372e3fSPaul Mullowney 742*aa372e3fSPaul Mullowney /* set the operation */ 743*aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 744*aa372e3fSPaul Mullowney 745*aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 746*aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 747*aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 748*aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 749*aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 750*aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 751*aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 752*aa372e3fSPaul Mullowney loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 753*aa372e3fSPaul Mullowney 754*aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. the CSC */ 755*aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 756*aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 757*aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 758*aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 759*aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 760*aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 761*aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 762*aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 763*aa372e3fSPaul Mullowney CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 764*aa372e3fSPaul Mullowney 765*aa372e3fSPaul Mullowney /* perform the solve analysis on the transposed matrix */ 766*aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 767*aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 768*aa372e3fSPaul Mullowney loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 769*aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 770*aa372e3fSPaul Mullowney loTriFactorT->solveInfo);CHKERRCUSP(stat); 771*aa372e3fSPaul Mullowney 772*aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 773*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 774*aa372e3fSPaul Mullowney 775*aa372e3fSPaul Mullowney /*********************************************/ 776*aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 777*aa372e3fSPaul Mullowney /*********************************************/ 778*aa372e3fSPaul Mullowney 779*aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 780*aa372e3fSPaul Mullowney upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 781*aa372e3fSPaul Mullowney 782*aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 783*aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 784*aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 785*aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 786*aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 787*aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 788*aa372e3fSPaul Mullowney 789*aa372e3fSPaul Mullowney /* Create the matrix description */ 790*aa372e3fSPaul Mullowney stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat); 791*aa372e3fSPaul Mullowney stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat); 792*aa372e3fSPaul Mullowney stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat); 793*aa372e3fSPaul Mullowney stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat); 794*aa372e3fSPaul Mullowney stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat); 795*aa372e3fSPaul Mullowney 796*aa372e3fSPaul Mullowney /* Create the solve analysis information */ 797*aa372e3fSPaul Mullowney stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat); 798*aa372e3fSPaul Mullowney 799*aa372e3fSPaul Mullowney /* set the operation */ 800*aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 801*aa372e3fSPaul Mullowney 802*aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 803*aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 804*aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 805*aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 806*aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 807*aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 808*aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 809*aa372e3fSPaul Mullowney upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 810*aa372e3fSPaul Mullowney 811*aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. the CSC */ 812*aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 813*aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 814*aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 815*aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 816*aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 817*aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 818*aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 819*aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 820*aa372e3fSPaul Mullowney CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 821*aa372e3fSPaul Mullowney 822*aa372e3fSPaul Mullowney /* perform the solve analysis on the transposed matrix */ 823*aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 824*aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 825*aa372e3fSPaul Mullowney upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 826*aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 827*aa372e3fSPaul Mullowney upTriFactorT->solveInfo);CHKERRCUSP(stat); 828*aa372e3fSPaul Mullowney 829*aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 830*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 831bda325fcSPaul Mullowney PetscFunctionReturn(0); 832bda325fcSPaul Mullowney } 833bda325fcSPaul Mullowney 834bda325fcSPaul Mullowney #undef __FUNCT__ 835bda325fcSPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult" 836b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 837bda325fcSPaul Mullowney { 838*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 839*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 840*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 841bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 842bda325fcSPaul Mullowney cusparseStatus_t stat; 843*aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 844b175d8bbSPaul Mullowney 845bda325fcSPaul Mullowney PetscFunctionBegin; 846*aa372e3fSPaul Mullowney 847*aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 848*aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 849*aa372e3fSPaul Mullowney stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat); 850*aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 851*aa372e3fSPaul Mullowney stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat); 852*aa372e3fSPaul Mullowney stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat); 853*aa372e3fSPaul Mullowney 854*aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 855*aa372e3fSPaul Mullowney CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 856*aa372e3fSPaul Mullowney CsrMatrix *matrixT= new CsrMatrix; 857*aa372e3fSPaul Mullowney matrixT->num_rows = A->rmap->n; 858*aa372e3fSPaul Mullowney matrixT->num_cols = A->cmap->n; 859*aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 860*aa372e3fSPaul Mullowney matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 861*aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 862*aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 863*aa372e3fSPaul Mullowney 864*aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. the CSC */ 865*aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 866*aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows, 867*aa372e3fSPaul Mullowney matrix->num_cols, matrix->num_entries, 868*aa372e3fSPaul Mullowney matrix->values->data().get(), 869*aa372e3fSPaul Mullowney matrix->row_offsets->data().get(), 870*aa372e3fSPaul Mullowney matrix->column_indices->data().get(), 871*aa372e3fSPaul Mullowney matrixT->values->data().get(), 872*aa372e3fSPaul Mullowney matrixT->column_indices->data().get(), 873*aa372e3fSPaul Mullowney matrixT->row_offsets->data().get(), 874*aa372e3fSPaul Mullowney CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 875*aa372e3fSPaul Mullowney 876*aa372e3fSPaul Mullowney /* assign the pointer */ 877*aa372e3fSPaul Mullowney matstructT->mat = matrixT; 878*aa372e3fSPaul Mullowney 879*aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 880*aa372e3fSPaul Mullowney 881*aa372e3fSPaul Mullowney /* First convert HYB to CSR */ 882*aa372e3fSPaul Mullowney CsrMatrix *temp= new CsrMatrix; 883*aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 884*aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 885*aa372e3fSPaul Mullowney temp->num_entries = a->nz; 886*aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 887*aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 888*aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 889*aa372e3fSPaul Mullowney 890*aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 891*aa372e3fSPaul Mullowney matstruct->descr, (cusparseHybMat_t)matstruct->mat, 892*aa372e3fSPaul Mullowney temp->values->data().get(), 893*aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 894*aa372e3fSPaul Mullowney temp->column_indices->data().get());CHKERRCUSP(stat); 895*aa372e3fSPaul Mullowney 896*aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 897*aa372e3fSPaul Mullowney CsrMatrix *tempT= new CsrMatrix; 898*aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 899*aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 900*aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 901*aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 902*aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 903*aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 904*aa372e3fSPaul Mullowney 905*aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 906*aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 907*aa372e3fSPaul Mullowney temp->values->data().get(), 908*aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 909*aa372e3fSPaul Mullowney temp->column_indices->data().get(), 910*aa372e3fSPaul Mullowney tempT->values->data().get(), 911*aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 912*aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 913*aa372e3fSPaul Mullowney CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 914*aa372e3fSPaul Mullowney 915*aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 916*aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 917*aa372e3fSPaul Mullowney stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat); 918*aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 919*aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 920*aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 921*aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 922*aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 923*aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 924*aa372e3fSPaul Mullowney hybMat, 0, partition);CHKERRCUSP(stat); 925*aa372e3fSPaul Mullowney 926*aa372e3fSPaul Mullowney /* assign the pointer */ 927*aa372e3fSPaul Mullowney matstructT->mat = hybMat; 928*aa372e3fSPaul Mullowney 929*aa372e3fSPaul Mullowney /* delete temporaries */ 930*aa372e3fSPaul Mullowney if (tempT) { 931*aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 932*aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 933*aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 934*aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 935087f3262SPaul Mullowney } 936*aa372e3fSPaul Mullowney if (temp) { 937*aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 938*aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 939*aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 940*aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 941*aa372e3fSPaul Mullowney } 942*aa372e3fSPaul Mullowney } 943*aa372e3fSPaul Mullowney /* assign the compressed row indices */ 944*aa372e3fSPaul Mullowney matstructT->cprowIndices = new THRUSTINTARRAY; 945*aa372e3fSPaul Mullowney 946*aa372e3fSPaul Mullowney /* assign the pointer */ 947*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 948bda325fcSPaul Mullowney PetscFunctionReturn(0); 949bda325fcSPaul Mullowney } 950bda325fcSPaul Mullowney 951bda325fcSPaul Mullowney #undef __FUNCT__ 952bda325fcSPaul Mullowney #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE" 9536fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 954bda325fcSPaul Mullowney { 955bda325fcSPaul Mullowney CUSPARRAY *xGPU, *bGPU; 956bda325fcSPaul Mullowney cusparseStatus_t stat; 957bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 958*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 959*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 960*aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 961b175d8bbSPaul Mullowney PetscErrorCode ierr; 962bda325fcSPaul Mullowney 963bda325fcSPaul Mullowney PetscFunctionBegin; 964*aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 965*aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 966bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 967*aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 968*aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 969bda325fcSPaul Mullowney } 970bda325fcSPaul Mullowney 971bda325fcSPaul Mullowney /* Get the GPU pointers */ 972bda325fcSPaul Mullowney ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 973bda325fcSPaul Mullowney ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 974bda325fcSPaul Mullowney 975*aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 976*aa372e3fSPaul Mullowney thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()), 977*aa372e3fSPaul Mullowney thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()), 978*aa372e3fSPaul Mullowney xGPU->begin()); 979*aa372e3fSPaul Mullowney 980*aa372e3fSPaul Mullowney /* First, solve U */ 981*aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 982*aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 983*aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 984*aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 985*aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 986*aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 987*aa372e3fSPaul Mullowney xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 988*aa372e3fSPaul Mullowney 989*aa372e3fSPaul Mullowney /* Then, solve L */ 990*aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 991*aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 992*aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 993*aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 994*aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 995*aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 996*aa372e3fSPaul Mullowney tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 997*aa372e3fSPaul Mullowney 998*aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 999*aa372e3fSPaul Mullowney thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1000*aa372e3fSPaul Mullowney thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()), 1001*aa372e3fSPaul Mullowney tempGPU->begin()); 1002*aa372e3fSPaul Mullowney 1003*aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1004*aa372e3fSPaul Mullowney thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin()); 1005bda325fcSPaul Mullowney 1006bda325fcSPaul Mullowney /* restore */ 1007bda325fcSPaul Mullowney ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 1008bda325fcSPaul Mullowney ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1009bda325fcSPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 1010087f3262SPaul Mullowney 1011*aa372e3fSPaul Mullowney ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1012bda325fcSPaul Mullowney PetscFunctionReturn(0); 1013bda325fcSPaul Mullowney } 1014bda325fcSPaul Mullowney 1015bda325fcSPaul Mullowney #undef __FUNCT__ 1016bda325fcSPaul Mullowney #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering" 10176fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1018bda325fcSPaul Mullowney { 1019bda325fcSPaul Mullowney CUSPARRAY *xGPU,*bGPU; 1020bda325fcSPaul Mullowney cusparseStatus_t stat; 1021bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1022*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1023*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1024*aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1025b175d8bbSPaul Mullowney PetscErrorCode ierr; 1026bda325fcSPaul Mullowney 1027bda325fcSPaul Mullowney PetscFunctionBegin; 1028*aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1029*aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1030bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1031*aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1032*aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1033bda325fcSPaul Mullowney } 1034bda325fcSPaul Mullowney 1035bda325fcSPaul Mullowney /* Get the GPU pointers */ 1036bda325fcSPaul Mullowney ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1037bda325fcSPaul Mullowney ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 1038bda325fcSPaul Mullowney 1039*aa372e3fSPaul Mullowney /* First, solve U */ 1040*aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1041*aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 1042*aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1043*aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1044*aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1045*aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1046*aa372e3fSPaul Mullowney bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1047*aa372e3fSPaul Mullowney 1048*aa372e3fSPaul Mullowney /* Then, solve L */ 1049*aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1050*aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1051*aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1052*aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1053*aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1054*aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1055*aa372e3fSPaul Mullowney tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1056bda325fcSPaul Mullowney 1057bda325fcSPaul Mullowney /* restore */ 1058bda325fcSPaul Mullowney ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 1059bda325fcSPaul Mullowney ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1060bda325fcSPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 1061*aa372e3fSPaul Mullowney ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1062bda325fcSPaul Mullowney PetscFunctionReturn(0); 1063bda325fcSPaul Mullowney } 1064bda325fcSPaul Mullowney 10659ae82921SPaul Mullowney #undef __FUNCT__ 10669ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE" 10676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 10689ae82921SPaul Mullowney { 1069bda325fcSPaul Mullowney CUSPARRAY *xGPU,*bGPU; 10709ae82921SPaul Mullowney cusparseStatus_t stat; 10719ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1072*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1073*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1074*aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1075b175d8bbSPaul Mullowney PetscErrorCode ierr; 10769ae82921SPaul Mullowney 10779ae82921SPaul Mullowney PetscFunctionBegin; 1078e057df02SPaul Mullowney /* Get the GPU pointers */ 10799ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 10809ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 10819ae82921SPaul Mullowney 1082*aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1083*aa372e3fSPaul Mullowney thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()), 1084*aa372e3fSPaul Mullowney thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()), 1085*aa372e3fSPaul Mullowney xGPU->begin()); 1086*aa372e3fSPaul Mullowney 1087*aa372e3fSPaul Mullowney /* Next, solve L */ 1088*aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1089*aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1090*aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1091*aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1092*aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1093*aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1094*aa372e3fSPaul Mullowney xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1095*aa372e3fSPaul Mullowney 1096*aa372e3fSPaul Mullowney /* Then, solve U */ 1097*aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1098*aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1099*aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1100*aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1101*aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1102*aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1103*aa372e3fSPaul Mullowney tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1104*aa372e3fSPaul Mullowney 1105*aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1106*aa372e3fSPaul Mullowney thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1107*aa372e3fSPaul Mullowney thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()), 1108*aa372e3fSPaul Mullowney tempGPU->begin()); 1109*aa372e3fSPaul Mullowney 1110*aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1111*aa372e3fSPaul Mullowney thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin()); 11129ae82921SPaul Mullowney 11139ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 11149ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 11159ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 1116*aa372e3fSPaul Mullowney ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 11179ae82921SPaul Mullowney PetscFunctionReturn(0); 11189ae82921SPaul Mullowney } 11199ae82921SPaul Mullowney 11209ae82921SPaul Mullowney #undef __FUNCT__ 11219ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering" 11226fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 11239ae82921SPaul Mullowney { 1124bda325fcSPaul Mullowney CUSPARRAY *xGPU,*bGPU; 11259ae82921SPaul Mullowney cusparseStatus_t stat; 11269ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1127*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1128*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1129*aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1130b175d8bbSPaul Mullowney PetscErrorCode ierr; 11319ae82921SPaul Mullowney 11329ae82921SPaul Mullowney PetscFunctionBegin; 1133e057df02SPaul Mullowney /* Get the GPU pointers */ 11349ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 11359ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 11369ae82921SPaul Mullowney 1137*aa372e3fSPaul Mullowney /* First, solve L */ 1138*aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1139*aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1140*aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1141*aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1142*aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1143*aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1144*aa372e3fSPaul Mullowney bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1145*aa372e3fSPaul Mullowney 1146*aa372e3fSPaul Mullowney /* Next, solve U */ 1147*aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1148*aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1149*aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1150*aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1151*aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1152*aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1153*aa372e3fSPaul Mullowney tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 11549ae82921SPaul Mullowney 11559ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 11569ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 11579ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 1158*aa372e3fSPaul Mullowney ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 11599ae82921SPaul Mullowney PetscFunctionReturn(0); 11609ae82921SPaul Mullowney } 11619ae82921SPaul Mullowney 11629ae82921SPaul Mullowney #undef __FUNCT__ 1163e057df02SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU" 11646fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 11659ae82921SPaul Mullowney { 11669ae82921SPaul Mullowney 1167*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1168*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 11699ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 11709ae82921SPaul Mullowney PetscInt m = A->rmap->n,*ii,*ridx; 11719ae82921SPaul Mullowney PetscErrorCode ierr; 1172*aa372e3fSPaul Mullowney cusparseStatus_t stat; 11739ae82921SPaul Mullowney 11749ae82921SPaul Mullowney PetscFunctionBegin; 11759ae82921SPaul Mullowney if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 11769ae82921SPaul Mullowney ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1177*aa372e3fSPaul Mullowney if (matstruct) { 1178*aa372e3fSPaul Mullowney SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized."); 11799ae82921SPaul Mullowney } 11809ae82921SPaul Mullowney try { 1181*aa372e3fSPaul Mullowney cusparsestruct->nonzerorow=0; 1182*aa372e3fSPaul Mullowney for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0); 11839ae82921SPaul Mullowney 11849ae82921SPaul Mullowney if (a->compressedrow.use) { 11859ae82921SPaul Mullowney m = a->compressedrow.nrows; 11869ae82921SPaul Mullowney ii = a->compressedrow.i; 11879ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 11889ae82921SPaul Mullowney } else { 1189e057df02SPaul Mullowney /* Forcing compressed row on the GPU ... only relevant for CSR storage */ 11909ae82921SPaul Mullowney int k=0; 1191*aa372e3fSPaul Mullowney ierr = PetscMalloc((cusparsestruct->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr); 1192*aa372e3fSPaul Mullowney ierr = PetscMalloc((cusparsestruct->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr); 11939ae82921SPaul Mullowney ii[0]=0; 11949ae82921SPaul Mullowney for (int j = 0; j<m; j++) { 11959ae82921SPaul Mullowney if ((a->i[j+1]-a->i[j])>0) { 11969ae82921SPaul Mullowney ii[k] = a->i[j]; 11979ae82921SPaul Mullowney ridx[k]= j; 11989ae82921SPaul Mullowney k++; 11999ae82921SPaul Mullowney } 12009ae82921SPaul Mullowney } 1201*aa372e3fSPaul Mullowney ii[cusparsestruct->nonzerorow] = a->nz; 12022205254eSKarl Rupp 1203*aa372e3fSPaul Mullowney m = cusparsestruct->nonzerorow; 12049ae82921SPaul Mullowney } 12059ae82921SPaul Mullowney 1206*aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 1207*aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1208*aa372e3fSPaul Mullowney stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat); 1209*aa372e3fSPaul Mullowney stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 1210*aa372e3fSPaul Mullowney stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat); 12119ae82921SPaul Mullowney 1212*aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1213*aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1214*aa372e3fSPaul Mullowney /* set the matrix */ 1215*aa372e3fSPaul Mullowney CsrMatrix *matrix= new CsrMatrix; 1216*aa372e3fSPaul Mullowney matrix->num_rows = A->rmap->n; 1217*aa372e3fSPaul Mullowney matrix->num_cols = A->cmap->n; 1218*aa372e3fSPaul Mullowney matrix->num_entries = a->nz; 1219*aa372e3fSPaul Mullowney matrix->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1220*aa372e3fSPaul Mullowney matrix->row_offsets->assign(ii, ii + A->rmap->n+1); 12219ae82921SPaul Mullowney 1222*aa372e3fSPaul Mullowney matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1223*aa372e3fSPaul Mullowney matrix->column_indices->assign(a->j, a->j+a->nz); 1224*aa372e3fSPaul Mullowney 1225*aa372e3fSPaul Mullowney matrix->values = new THRUSTARRAY(a->nz); 1226*aa372e3fSPaul Mullowney matrix->values->assign(a->a, a->a+a->nz); 1227*aa372e3fSPaul Mullowney 1228*aa372e3fSPaul Mullowney /* assign the pointer */ 1229*aa372e3fSPaul Mullowney matstruct->mat = matrix; 1230*aa372e3fSPaul Mullowney 1231*aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1232*aa372e3fSPaul Mullowney 1233*aa372e3fSPaul Mullowney CsrMatrix *matrix= new CsrMatrix; 1234*aa372e3fSPaul Mullowney matrix->num_rows = A->rmap->n; 1235*aa372e3fSPaul Mullowney matrix->num_cols = A->cmap->n; 1236*aa372e3fSPaul Mullowney matrix->num_entries = a->nz; 1237*aa372e3fSPaul Mullowney matrix->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1238*aa372e3fSPaul Mullowney matrix->row_offsets->assign(ii, ii + A->rmap->n+1); 1239*aa372e3fSPaul Mullowney 1240*aa372e3fSPaul Mullowney matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1241*aa372e3fSPaul Mullowney matrix->column_indices->assign(a->j, a->j+a->nz); 1242*aa372e3fSPaul Mullowney 1243*aa372e3fSPaul Mullowney matrix->values = new THRUSTARRAY(a->nz); 1244*aa372e3fSPaul Mullowney matrix->values->assign(a->a, a->a+a->nz); 1245*aa372e3fSPaul Mullowney 1246*aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 1247*aa372e3fSPaul Mullowney stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat); 1248*aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1249*aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1250*aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1251*aa372e3fSPaul Mullowney matstruct->descr, matrix->values->data().get(), 1252*aa372e3fSPaul Mullowney matrix->row_offsets->data().get(), 1253*aa372e3fSPaul Mullowney matrix->column_indices->data().get(), 1254*aa372e3fSPaul Mullowney hybMat, 0, partition);CHKERRCUSP(stat); 1255*aa372e3fSPaul Mullowney /* assign the pointer */ 1256*aa372e3fSPaul Mullowney matstruct->mat = hybMat; 1257*aa372e3fSPaul Mullowney 1258*aa372e3fSPaul Mullowney if (matrix) { 1259*aa372e3fSPaul Mullowney if (matrix->values) delete (THRUSTARRAY*)matrix->values; 1260*aa372e3fSPaul Mullowney if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices; 1261*aa372e3fSPaul Mullowney if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets; 1262*aa372e3fSPaul Mullowney delete (CsrMatrix*)matrix; 1263087f3262SPaul Mullowney } 1264087f3262SPaul Mullowney } 1265ca45077fSPaul Mullowney 1266*aa372e3fSPaul Mullowney /* assign the compressed row indices */ 1267*aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 1268*aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 1269*aa372e3fSPaul Mullowney 1270*aa372e3fSPaul Mullowney /* assign the pointer */ 1271*aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 1272*aa372e3fSPaul Mullowney 12739ae82921SPaul Mullowney if (!a->compressedrow.use) { 12749ae82921SPaul Mullowney ierr = PetscFree(ii);CHKERRQ(ierr); 12759ae82921SPaul Mullowney ierr = PetscFree(ridx);CHKERRQ(ierr); 12769ae82921SPaul Mullowney } 1277*aa372e3fSPaul Mullowney cusparsestruct->workVector = new THRUSTARRAY; 1278*aa372e3fSPaul Mullowney cusparsestruct->workVector->resize(m); 12799ae82921SPaul Mullowney } catch(char *ex) { 12809ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 12819ae82921SPaul Mullowney } 12829ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 12832205254eSKarl Rupp 12849ae82921SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_BOTH; 12852205254eSKarl Rupp 12869ae82921SPaul Mullowney ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 12879ae82921SPaul Mullowney } 12889ae82921SPaul Mullowney PetscFunctionReturn(0); 12899ae82921SPaul Mullowney } 12909ae82921SPaul Mullowney 12919ae82921SPaul Mullowney #undef __FUNCT__ 12929ae82921SPaul Mullowney #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE" 12936fa9248bSJed Brown static PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left) 12949ae82921SPaul Mullowney { 12959ae82921SPaul Mullowney PetscErrorCode ierr; 12969ae82921SPaul Mullowney 12979ae82921SPaul Mullowney PetscFunctionBegin; 12989ae82921SPaul Mullowney if (right) { 1299ce94432eSBarry Smith ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 13009ae82921SPaul Mullowney ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 13019ae82921SPaul Mullowney ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 13029ae82921SPaul Mullowney ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr); 13039ae82921SPaul Mullowney ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 13049ae82921SPaul Mullowney } 13059ae82921SPaul Mullowney if (left) { 1306ce94432eSBarry Smith ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 13079ae82921SPaul Mullowney ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 13089ae82921SPaul Mullowney ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 13099ae82921SPaul Mullowney ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr); 13109ae82921SPaul Mullowney ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 13119ae82921SPaul Mullowney } 13129ae82921SPaul Mullowney PetscFunctionReturn(0); 13139ae82921SPaul Mullowney } 13149ae82921SPaul Mullowney 1315*aa372e3fSPaul Mullowney struct VecCUSPPlusEquals 1316*aa372e3fSPaul Mullowney { 1317*aa372e3fSPaul Mullowney template <typename Tuple> 1318*aa372e3fSPaul Mullowney __host__ __device__ 1319*aa372e3fSPaul Mullowney void operator()(Tuple t) 1320*aa372e3fSPaul Mullowney { 1321*aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1322*aa372e3fSPaul Mullowney } 1323*aa372e3fSPaul Mullowney }; 1324*aa372e3fSPaul Mullowney 13259ae82921SPaul Mullowney #undef __FUNCT__ 13269ae82921SPaul Mullowney #define __FUNCT__ "MatMult_SeqAIJCUSPARSE" 13276fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 13289ae82921SPaul Mullowney { 13299ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1330*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1331*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1332bda325fcSPaul Mullowney CUSPARRAY *xarray,*yarray; 1333b175d8bbSPaul Mullowney PetscErrorCode ierr; 1334*aa372e3fSPaul Mullowney cusparseStatus_t stat; 13359ae82921SPaul Mullowney 13369ae82921SPaul Mullowney PetscFunctionBegin; 1337e057df02SPaul Mullowney /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1338e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 13399ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 13409ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1341*aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1342*aa372e3fSPaul Mullowney CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1343*aa372e3fSPaul Mullowney stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1344*aa372e3fSPaul Mullowney mat->num_rows, mat->num_cols, mat->num_entries, 1345*aa372e3fSPaul Mullowney &ALPHA, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), 1346*aa372e3fSPaul Mullowney mat->column_indices->data().get(), xarray->data().get(), &BETA, 1347*aa372e3fSPaul Mullowney yarray->data().get());CHKERRCUSP(stat); 1348*aa372e3fSPaul Mullowney } else { 1349*aa372e3fSPaul Mullowney cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1350*aa372e3fSPaul Mullowney stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1351*aa372e3fSPaul Mullowney &ALPHA, matstruct->descr, hybMat, 1352*aa372e3fSPaul Mullowney xarray->data().get(), &BETA, 1353*aa372e3fSPaul Mullowney yarray->data().get());CHKERRCUSP(stat); 13549ae82921SPaul Mullowney } 13559ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 13569ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1357*aa372e3fSPaul Mullowney if (!cusparsestruct->stream) { 13589ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 1359ca45077fSPaul Mullowney } 1360*aa372e3fSPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 13619ae82921SPaul Mullowney PetscFunctionReturn(0); 13629ae82921SPaul Mullowney } 13639ae82921SPaul Mullowney 13649ae82921SPaul Mullowney #undef __FUNCT__ 1365ca45077fSPaul Mullowney #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE" 13666fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1367ca45077fSPaul Mullowney { 1368ca45077fSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1369*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1370*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1371bda325fcSPaul Mullowney CUSPARRAY *xarray,*yarray; 1372b175d8bbSPaul Mullowney PetscErrorCode ierr; 1373*aa372e3fSPaul Mullowney cusparseStatus_t stat; 1374ca45077fSPaul Mullowney 1375ca45077fSPaul Mullowney PetscFunctionBegin; 1376e057df02SPaul Mullowney /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1377e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1378*aa372e3fSPaul Mullowney if (!matstructT) { 1379bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1380*aa372e3fSPaul Mullowney matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1381bda325fcSPaul Mullowney } 1382ca45077fSPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1383ca45077fSPaul Mullowney ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1384*aa372e3fSPaul Mullowney 1385*aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1386*aa372e3fSPaul Mullowney CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1387*aa372e3fSPaul Mullowney stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1388*aa372e3fSPaul Mullowney mat->num_rows, mat->num_cols, 1389*aa372e3fSPaul Mullowney mat->num_entries, &ALPHA, matstructT->descr, 1390*aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 1391*aa372e3fSPaul Mullowney mat->column_indices->data().get(), xarray->data().get(), &BETA, 1392*aa372e3fSPaul Mullowney yarray->data().get());CHKERRCUSP(stat); 1393*aa372e3fSPaul Mullowney } else { 1394*aa372e3fSPaul Mullowney cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1395*aa372e3fSPaul Mullowney stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1396*aa372e3fSPaul Mullowney &ALPHA, matstructT->descr, hybMat, 1397*aa372e3fSPaul Mullowney xarray->data().get(), &BETA, 1398*aa372e3fSPaul Mullowney yarray->data().get());CHKERRCUSP(stat); 1399ca45077fSPaul Mullowney } 1400ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1401ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1402*aa372e3fSPaul Mullowney if (!cusparsestruct->stream) { 1403ca45077fSPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 1404ca45077fSPaul Mullowney } 1405*aa372e3fSPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1406ca45077fSPaul Mullowney PetscFunctionReturn(0); 1407ca45077fSPaul Mullowney } 1408ca45077fSPaul Mullowney 1409*aa372e3fSPaul Mullowney 1410ca45077fSPaul Mullowney #undef __FUNCT__ 14119ae82921SPaul Mullowney #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 14126fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 14139ae82921SPaul Mullowney { 14149ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1415*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1416*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1417bda325fcSPaul Mullowney CUSPARRAY *xarray,*yarray,*zarray; 1418b175d8bbSPaul Mullowney PetscErrorCode ierr; 1419*aa372e3fSPaul Mullowney cusparseStatus_t stat; 14206e111a19SKarl Rupp 14219ae82921SPaul Mullowney PetscFunctionBegin; 1422e057df02SPaul Mullowney /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1423e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 14249ae82921SPaul Mullowney try { 14259ae82921SPaul Mullowney ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 14269ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 14279ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 14289ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 14299ae82921SPaul Mullowney 1430e057df02SPaul Mullowney /* multiply add */ 1431*aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1432*aa372e3fSPaul Mullowney CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1433*aa372e3fSPaul Mullowney stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1434*aa372e3fSPaul Mullowney mat->num_rows, mat->num_cols, 1435*aa372e3fSPaul Mullowney mat->num_entries, &ALPHA, matstruct->descr, 1436*aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 1437*aa372e3fSPaul Mullowney mat->column_indices->data().get(), xarray->data().get(), &BETA, 1438*aa372e3fSPaul Mullowney cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1439*aa372e3fSPaul Mullowney } else { 1440*aa372e3fSPaul Mullowney cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1441*aa372e3fSPaul Mullowney stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1442*aa372e3fSPaul Mullowney &ALPHA, matstruct->descr, hybMat, 1443*aa372e3fSPaul Mullowney xarray->data().get(), &BETA, 1444*aa372e3fSPaul Mullowney cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1445*aa372e3fSPaul Mullowney } 1446*aa372e3fSPaul Mullowney 1447*aa372e3fSPaul Mullowney /* scatter the data from the temporary into the full vector with a += operation */ 1448*aa372e3fSPaul Mullowney thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))), 1449*aa372e3fSPaul Mullowney thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1450*aa372e3fSPaul Mullowney VecCUSPPlusEquals()); 14519ae82921SPaul Mullowney 14529ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 14539ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 14549ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 14559ae82921SPaul Mullowney 14569ae82921SPaul Mullowney } catch(char *ex) { 14579ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 14589ae82921SPaul Mullowney } 14599ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 14609ae82921SPaul Mullowney ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 14619ae82921SPaul Mullowney PetscFunctionReturn(0); 14629ae82921SPaul Mullowney } 14639ae82921SPaul Mullowney 14649ae82921SPaul Mullowney #undef __FUNCT__ 1465b175d8bbSPaul Mullowney #define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE" 14666fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1467ca45077fSPaul Mullowney { 1468ca45077fSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1469*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1470*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1471ca45077fSPaul Mullowney CUSPARRAY *xarray,*yarray,*zarray; 1472b175d8bbSPaul Mullowney PetscErrorCode ierr; 1473*aa372e3fSPaul Mullowney cusparseStatus_t stat; 14746e111a19SKarl Rupp 1475ca45077fSPaul Mullowney PetscFunctionBegin; 1476e057df02SPaul Mullowney /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1477e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1478*aa372e3fSPaul Mullowney if (!matstructT) { 1479bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1480*aa372e3fSPaul Mullowney matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1481bda325fcSPaul Mullowney } 1482*aa372e3fSPaul Mullowney 1483ca45077fSPaul Mullowney try { 1484ca45077fSPaul Mullowney ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 1485ca45077fSPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1486ca45077fSPaul Mullowney ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 1487ca45077fSPaul Mullowney ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1488ca45077fSPaul Mullowney 1489e057df02SPaul Mullowney /* multiply add with matrix transpose */ 1490*aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1491*aa372e3fSPaul Mullowney CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1492*aa372e3fSPaul Mullowney stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1493*aa372e3fSPaul Mullowney mat->num_rows, mat->num_cols, mat->num_entries, &ALPHA, matstructT->descr, 1494*aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 1495*aa372e3fSPaul Mullowney mat->column_indices->data().get(), xarray->data().get(), &BETA, 1496*aa372e3fSPaul Mullowney cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1497*aa372e3fSPaul Mullowney } else { 1498*aa372e3fSPaul Mullowney cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1499*aa372e3fSPaul Mullowney stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1500*aa372e3fSPaul Mullowney &ALPHA, matstructT->descr, hybMat, 1501*aa372e3fSPaul Mullowney xarray->data().get(), &BETA, 1502*aa372e3fSPaul Mullowney cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1503*aa372e3fSPaul Mullowney } 1504*aa372e3fSPaul Mullowney 1505*aa372e3fSPaul Mullowney /* scatter the data from the temporary into the full vector with a += operation */ 1506*aa372e3fSPaul Mullowney thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))), 1507*aa372e3fSPaul Mullowney thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1508*aa372e3fSPaul Mullowney VecCUSPPlusEquals()); 1509ca45077fSPaul Mullowney 1510ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1511ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 1512ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1513ca45077fSPaul Mullowney 1514ca45077fSPaul Mullowney } catch(char *ex) { 1515ca45077fSPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1516ca45077fSPaul Mullowney } 1517ca45077fSPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 1518ca45077fSPaul Mullowney ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1519ca45077fSPaul Mullowney PetscFunctionReturn(0); 1520ca45077fSPaul Mullowney } 1521ca45077fSPaul Mullowney 1522ca45077fSPaul Mullowney #undef __FUNCT__ 15239ae82921SPaul Mullowney #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 15246fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 15259ae82921SPaul Mullowney { 15269ae82921SPaul Mullowney PetscErrorCode ierr; 15276e111a19SKarl Rupp 15289ae82921SPaul Mullowney PetscFunctionBegin; 15299ae82921SPaul Mullowney ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1530bc3f50f2SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 1531e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1532bc3f50f2SPaul Mullowney } 15339ae82921SPaul Mullowney if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1534bbf3fe20SPaul Mullowney A->ops->mult = MatMult_SeqAIJCUSPARSE; 1535bbf3fe20SPaul Mullowney A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1536bbf3fe20SPaul Mullowney A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1537bbf3fe20SPaul Mullowney A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 15389ae82921SPaul Mullowney PetscFunctionReturn(0); 15399ae82921SPaul Mullowney } 15409ae82921SPaul Mullowney 15419ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 15429ae82921SPaul Mullowney #undef __FUNCT__ 15439ae82921SPaul Mullowney #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 1544e057df02SPaul Mullowney /*@ 15459ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1546e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 1547e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1548e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 1549e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 1550e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 15519ae82921SPaul Mullowney 15529ae82921SPaul Mullowney Collective on MPI_Comm 15539ae82921SPaul Mullowney 15549ae82921SPaul Mullowney Input Parameters: 15559ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 15569ae82921SPaul Mullowney . m - number of rows 15579ae82921SPaul Mullowney . n - number of columns 15589ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 15599ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 15600298fd71SBarry Smith (possibly different for each row) or NULL 15619ae82921SPaul Mullowney 15629ae82921SPaul Mullowney Output Parameter: 15639ae82921SPaul Mullowney . A - the matrix 15649ae82921SPaul Mullowney 15659ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 15669ae82921SPaul Mullowney MatXXXXSetPreallocation() paradgm instead of this routine directly. 15679ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 15689ae82921SPaul Mullowney 15699ae82921SPaul Mullowney Notes: 15709ae82921SPaul Mullowney If nnz is given then nz is ignored 15719ae82921SPaul Mullowney 15729ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 15739ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 15749ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 15759ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 15769ae82921SPaul Mullowney 15779ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 15780298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 15799ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 15809ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 15819ae82921SPaul Mullowney 15829ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 15839ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 15849ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 15859ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 15869ae82921SPaul Mullowney 15879ae82921SPaul Mullowney Level: intermediate 15889ae82921SPaul Mullowney 1589e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 15909ae82921SPaul Mullowney @*/ 15919ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 15929ae82921SPaul Mullowney { 15939ae82921SPaul Mullowney PetscErrorCode ierr; 15949ae82921SPaul Mullowney 15959ae82921SPaul Mullowney PetscFunctionBegin; 15969ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 15979ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 15989ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 15999ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 16009ae82921SPaul Mullowney PetscFunctionReturn(0); 16019ae82921SPaul Mullowney } 16029ae82921SPaul Mullowney 16039ae82921SPaul Mullowney #undef __FUNCT__ 16049ae82921SPaul Mullowney #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 16056fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 16069ae82921SPaul Mullowney { 16079ae82921SPaul Mullowney PetscErrorCode ierr; 1608*aa372e3fSPaul Mullowney cusparseStatus_t stat; 16099ae82921SPaul Mullowney 16109ae82921SPaul Mullowney PetscFunctionBegin; 16119ae82921SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 16129ae82921SPaul Mullowney try { 16139ae82921SPaul Mullowney if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) { 1614*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1615*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1616*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1617*aa372e3fSPaul Mullowney 1618*aa372e3fSPaul Mullowney /* delete all the members in each of the data structures */ 1619*aa372e3fSPaul Mullowney if (matstruct) { 1620*aa372e3fSPaul Mullowney if (matstruct->descr) { stat = cusparseDestroyMatDescr(matstruct->descr);CHKERRCUSP(stat); } 1621*aa372e3fSPaul Mullowney if (matstruct->mat) { 1622*aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1623*aa372e3fSPaul Mullowney cusparseHybMat_t hybMat = (cusparseHybMat_t) matstruct->mat; 1624*aa372e3fSPaul Mullowney stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat); 1625*aa372e3fSPaul Mullowney } else { 1626*aa372e3fSPaul Mullowney CsrMatrix* mat = (CsrMatrix*)matstruct->mat; 1627*aa372e3fSPaul Mullowney if (mat->values) delete (THRUSTARRAY*)mat->values; 1628*aa372e3fSPaul Mullowney if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1629*aa372e3fSPaul Mullowney if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1630*aa372e3fSPaul Mullowney delete (CsrMatrix*)matstruct->mat; 16319ae82921SPaul Mullowney } 1632*aa372e3fSPaul Mullowney } 1633*aa372e3fSPaul Mullowney if (matstruct->cprowIndices) delete (THRUSTINTARRAY*)matstruct->cprowIndices; 1634*aa372e3fSPaul Mullowney if (matstruct) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstruct; 1635*aa372e3fSPaul Mullowney } 1636*aa372e3fSPaul Mullowney if (matstructT) { 1637*aa372e3fSPaul Mullowney if (matstructT->descr) { stat = cusparseDestroyMatDescr(matstructT->descr);CHKERRCUSP(stat); } 1638*aa372e3fSPaul Mullowney if (matstructT->mat) { 1639*aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1640*aa372e3fSPaul Mullowney cusparseHybMat_t hybMat = (cusparseHybMat_t) matstructT->mat; 1641*aa372e3fSPaul Mullowney stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat); 1642*aa372e3fSPaul Mullowney } else { 1643*aa372e3fSPaul Mullowney CsrMatrix* matT = (CsrMatrix*)matstructT->mat; 1644*aa372e3fSPaul Mullowney if (matT->values) delete (THRUSTARRAY*)matT->values; 1645*aa372e3fSPaul Mullowney if (matT->column_indices) delete (THRUSTINTARRAY32*)matT->column_indices; 1646*aa372e3fSPaul Mullowney if (matT->row_offsets) delete (THRUSTINTARRAY32*)matT->row_offsets; 1647*aa372e3fSPaul Mullowney delete (CsrMatrix*)matstructT->mat; 1648*aa372e3fSPaul Mullowney } 1649*aa372e3fSPaul Mullowney } 1650*aa372e3fSPaul Mullowney if (matstructT->cprowIndices) delete (THRUSTINTARRAY*)matstructT->cprowIndices; 1651*aa372e3fSPaul Mullowney if (matstructT) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstructT; 1652*aa372e3fSPaul Mullowney } 1653*aa372e3fSPaul Mullowney if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector; 1654*aa372e3fSPaul Mullowney if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); } 1655*aa372e3fSPaul Mullowney if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct; 16569ae82921SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 1657*aa372e3fSPaul Mullowney } else { 1658*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1659*aa372e3fSPaul Mullowney if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); } 1660*aa372e3fSPaul Mullowney if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct; 1661*aa372e3fSPaul Mullowney } 16629ae82921SPaul Mullowney } catch(char *ex) { 16639ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 16649ae82921SPaul Mullowney } 16659ae82921SPaul Mullowney } else { 1666e057df02SPaul Mullowney /* The triangular factors */ 16679ae82921SPaul Mullowney try { 16689ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1669*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1670*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1671*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1672*aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1673*aa372e3fSPaul Mullowney 1674*aa372e3fSPaul Mullowney /* delete all the members in each of the data structures */ 1675*aa372e3fSPaul Mullowney if (loTriFactor) { 1676*aa372e3fSPaul Mullowney if (loTriFactor->descr) { stat = cusparseDestroyMatDescr(loTriFactor->descr);CHKERRCUSP(stat); } 1677*aa372e3fSPaul Mullowney if (loTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactor->solveInfo);CHKERRCUSP(stat); } 1678*aa372e3fSPaul Mullowney if (loTriFactor->csrMat) { 1679*aa372e3fSPaul Mullowney if (loTriFactor->csrMat->values) delete (THRUSTARRAY*)loTriFactor->csrMat->values; 1680*aa372e3fSPaul Mullowney if (loTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->column_indices; 1681*aa372e3fSPaul Mullowney if (loTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->row_offsets; 1682*aa372e3fSPaul Mullowney delete (CsrMatrix*)loTriFactor->csrMat; 1683*aa372e3fSPaul Mullowney } 1684*aa372e3fSPaul Mullowney if (loTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactor; 1685*aa372e3fSPaul Mullowney } 1686*aa372e3fSPaul Mullowney 1687*aa372e3fSPaul Mullowney if (upTriFactor) { 1688*aa372e3fSPaul Mullowney if (upTriFactor->descr) { stat = cusparseDestroyMatDescr(upTriFactor->descr);CHKERRCUSP(stat); } 1689*aa372e3fSPaul Mullowney if (upTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactor->solveInfo);CHKERRCUSP(stat); } 1690*aa372e3fSPaul Mullowney if (upTriFactor->csrMat) { 1691*aa372e3fSPaul Mullowney if (upTriFactor->csrMat->values) delete (THRUSTARRAY*)upTriFactor->csrMat->values; 1692*aa372e3fSPaul Mullowney if (upTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->column_indices; 1693*aa372e3fSPaul Mullowney if (upTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->row_offsets; 1694*aa372e3fSPaul Mullowney delete (CsrMatrix*)upTriFactor->csrMat; 1695*aa372e3fSPaul Mullowney } 1696*aa372e3fSPaul Mullowney if (upTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactor; 1697*aa372e3fSPaul Mullowney } 1698*aa372e3fSPaul Mullowney 1699*aa372e3fSPaul Mullowney if (loTriFactorT) { 1700*aa372e3fSPaul Mullowney if (loTriFactorT->descr) { stat = cusparseDestroyMatDescr(loTriFactorT->descr);CHKERRCUSP(stat); } 1701*aa372e3fSPaul Mullowney if (loTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactorT->solveInfo);CHKERRCUSP(stat); } 1702*aa372e3fSPaul Mullowney if (loTriFactorT->csrMat) { 1703*aa372e3fSPaul Mullowney if (loTriFactorT->csrMat->values) delete (THRUSTARRAY*)loTriFactorT->csrMat->values; 1704*aa372e3fSPaul Mullowney if (loTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->column_indices; 1705*aa372e3fSPaul Mullowney if (loTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->row_offsets; 1706*aa372e3fSPaul Mullowney delete (CsrMatrix*)loTriFactorT->csrMat; 1707*aa372e3fSPaul Mullowney } 1708*aa372e3fSPaul Mullowney if (loTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactorT; 1709*aa372e3fSPaul Mullowney } 1710*aa372e3fSPaul Mullowney 1711*aa372e3fSPaul Mullowney if (upTriFactorT) { 1712*aa372e3fSPaul Mullowney if (upTriFactorT->descr) { stat = cusparseDestroyMatDescr(upTriFactorT->descr);CHKERRCUSP(stat); } 1713*aa372e3fSPaul Mullowney if (upTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactorT->solveInfo);CHKERRCUSP(stat); } 1714*aa372e3fSPaul Mullowney if (upTriFactorT->csrMat) { 1715*aa372e3fSPaul Mullowney if (upTriFactorT->csrMat->values) delete (THRUSTARRAY*)upTriFactorT->csrMat->values; 1716*aa372e3fSPaul Mullowney if (upTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->column_indices; 1717*aa372e3fSPaul Mullowney if (upTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->row_offsets; 1718*aa372e3fSPaul Mullowney delete (CsrMatrix*)upTriFactorT->csrMat; 1719*aa372e3fSPaul Mullowney } 1720*aa372e3fSPaul Mullowney if (upTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactorT; 1721*aa372e3fSPaul Mullowney } 1722*aa372e3fSPaul Mullowney if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices; 1723*aa372e3fSPaul Mullowney if (cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices; 1724*aa372e3fSPaul Mullowney if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector; 1725*aa372e3fSPaul Mullowney if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); } 1726*aa372e3fSPaul Mullowney if (cusparseTriFactors) delete (Mat_SeqAIJCUSPARSETriFactors*)cusparseTriFactors; 17279ae82921SPaul Mullowney } catch(char *ex) { 17289ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 17299ae82921SPaul Mullowney } 17309ae82921SPaul Mullowney } 17319ae82921SPaul Mullowney /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 17329ae82921SPaul Mullowney A->spptr = 0; 17339ae82921SPaul Mullowney 17349ae82921SPaul Mullowney ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 17359ae82921SPaul Mullowney PetscFunctionReturn(0); 17369ae82921SPaul Mullowney } 17379ae82921SPaul Mullowney 17389ae82921SPaul Mullowney #undef __FUNCT__ 17399ae82921SPaul Mullowney #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 17408cc058d9SJed Brown PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 17419ae82921SPaul Mullowney { 17429ae82921SPaul Mullowney PetscErrorCode ierr; 1743*aa372e3fSPaul Mullowney cusparseStatus_t stat; 1744*aa372e3fSPaul Mullowney cusparseHandle_t handle=0; 17459ae82921SPaul Mullowney 17469ae82921SPaul Mullowney PetscFunctionBegin; 17479ae82921SPaul Mullowney ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 17489ae82921SPaul Mullowney if (B->factortype==MAT_FACTOR_NONE) { 1749e057df02SPaul Mullowney /* you cannot check the inode.use flag here since the matrix was just created. 1750e057df02SPaul Mullowney now build a GPU matrix data structure */ 17519ae82921SPaul Mullowney B->spptr = new Mat_SeqAIJCUSPARSE; 17529ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1753*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1754*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1755e057df02SPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1756*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1757*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1758*aa372e3fSPaul Mullowney stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1759*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1760*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 17619ae82921SPaul Mullowney } else { 17629ae82921SPaul Mullowney /* NEXT, set the pointers to the triangular factors */ 1763debe9ee2SPaul Mullowney B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 17649ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 17659ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1766*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1767*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1768*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1769*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1770*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1771e057df02SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format = cusparseMatSolveStorageFormat; 1772*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1773*aa372e3fSPaul Mullowney stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1774*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1775*aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 17769ae82921SPaul Mullowney } 1777*aa372e3fSPaul Mullowney 1778e057df02SPaul Mullowney /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the 1779e057df02SPaul Mullowney default cusparse tri solve. Note the difference with the implementation in 1780e057df02SPaul Mullowney MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */ 1781bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr); 17822205254eSKarl Rupp 17839ae82921SPaul Mullowney B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 17849ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 17859ae82921SPaul Mullowney B->ops->getvecs = MatGetVecs_SeqAIJCUSPARSE; 17869ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1787ca45077fSPaul Mullowney B->ops->mult = MatMult_SeqAIJCUSPARSE; 1788ca45077fSPaul Mullowney B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1789ca45077fSPaul Mullowney B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1790ca45077fSPaul Mullowney B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 17912205254eSKarl Rupp 17929ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 17932205254eSKarl Rupp 17949ae82921SPaul Mullowney B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 17952205254eSKarl Rupp 1796bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 17979ae82921SPaul Mullowney PetscFunctionReturn(0); 17989ae82921SPaul Mullowney } 17999ae82921SPaul Mullowney 1800e057df02SPaul Mullowney /*M 1801e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1802e057df02SPaul Mullowney 1803e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1804e057df02SPaul Mullowney CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using 1805*aa372e3fSPaul Mullowney the CUSPARSE library. 1806e057df02SPaul Mullowney 1807e057df02SPaul Mullowney Options Database Keys: 1808e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1809*aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1810*aa372e3fSPaul Mullowney . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1811*aa372e3fSPaul Mullowney - -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1812e057df02SPaul Mullowney 1813e057df02SPaul Mullowney Level: beginner 1814e057df02SPaul Mullowney 18158468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1816e057df02SPaul Mullowney M*/ 1817