19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3bc3f50f2SPaul Mullowney matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 59ae82921SPaul Mullowney 63d13b8fdSMatthew G. Knepley #include <petscconf.h> 73d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 93d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 103d13b8fdSMatthew G. Knepley #include <petsc-private/vecimpl.h> 119ae82921SPaul Mullowney #undef VecType 123d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 13bc3f50f2SPaul Mullowney 14e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 159ae82921SPaul Mullowney 16087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 17087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 18087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 19087f3262SPaul Mullowney 206fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 216fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 226fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 23087f3262SPaul Mullowney 246fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 256fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 266fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 276fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 286fa9248bSJed Brown static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat); 296fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 306fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 316fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 326fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 339ae82921SPaul Mullowney 34*7f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 35*7f756511SDominic Meiser static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 36*7f756511SDominic Meiser static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 37*7f756511SDominic Meiser static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 38*7f756511SDominic Meiser static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 39*7f756511SDominic Meiser 409ae82921SPaul Mullowney #undef __FUNCT__ 41b06137fdSPaul Mullowney #define __FUNCT__ "MatCUSPARSESetStream" 42b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 43b06137fdSPaul Mullowney { 44b06137fdSPaul Mullowney cusparseStatus_t stat; 45b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 46b06137fdSPaul Mullowney 47b06137fdSPaul Mullowney PetscFunctionBegin; 48b06137fdSPaul Mullowney cusparsestruct->stream = stream; 49b06137fdSPaul Mullowney stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSP(stat); 50b06137fdSPaul Mullowney PetscFunctionReturn(0); 51b06137fdSPaul Mullowney } 52b06137fdSPaul Mullowney 53b06137fdSPaul Mullowney #undef __FUNCT__ 54b06137fdSPaul Mullowney #define __FUNCT__ "MatCUSPARSESetHandle" 55b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 56b06137fdSPaul Mullowney { 57b06137fdSPaul Mullowney cusparseStatus_t stat; 58b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 59b06137fdSPaul Mullowney 60b06137fdSPaul Mullowney PetscFunctionBegin; 61b06137fdSPaul Mullowney if (cusparsestruct->handle) 62b06137fdSPaul Mullowney stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); 63b06137fdSPaul Mullowney cusparsestruct->handle = handle; 64b06137fdSPaul Mullowney stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 65b06137fdSPaul Mullowney PetscFunctionReturn(0); 66b06137fdSPaul Mullowney } 67b06137fdSPaul Mullowney 68b06137fdSPaul Mullowney #undef __FUNCT__ 69b06137fdSPaul Mullowney #define __FUNCT__ "MatCUSPARSEClearHandle" 70b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A) 71b06137fdSPaul Mullowney { 72b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 73b06137fdSPaul Mullowney PetscFunctionBegin; 74b06137fdSPaul Mullowney if (cusparsestruct->handle) 75b06137fdSPaul Mullowney cusparsestruct->handle = 0; 76b06137fdSPaul Mullowney PetscFunctionReturn(0); 77b06137fdSPaul Mullowney } 78b06137fdSPaul Mullowney 79b06137fdSPaul Mullowney #undef __FUNCT__ 809ae82921SPaul Mullowney #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse" 819ae82921SPaul Mullowney PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type) 829ae82921SPaul Mullowney { 839ae82921SPaul Mullowney PetscFunctionBegin; 849ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 859ae82921SPaul Mullowney PetscFunctionReturn(0); 869ae82921SPaul Mullowney } 879ae82921SPaul Mullowney 88c708e6cdSJed Brown /*MC 89087f3262SPaul Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 90087f3262SPaul Mullowney on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 91087f3262SPaul Mullowney algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 92087f3262SPaul Mullowney performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 93087f3262SPaul Mullowney CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 94087f3262SPaul Mullowney algorithms are not recommended. This class does NOT support direct solver operations. 95c708e6cdSJed Brown 969ae82921SPaul Mullowney Level: beginner 97c708e6cdSJed Brown 98c708e6cdSJed Brown .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 99c708e6cdSJed Brown M*/ 1009ae82921SPaul Mullowney 1019ae82921SPaul Mullowney #undef __FUNCT__ 1029ae82921SPaul Mullowney #define __FUNCT__ "MatGetFactor_seqaij_cusparse" 1033c3a0fd4SSatish Balay PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B) 1049ae82921SPaul Mullowney { 1059ae82921SPaul Mullowney PetscErrorCode ierr; 106bc3f50f2SPaul Mullowney PetscInt n = A->rmap->n; 1079ae82921SPaul Mullowney 1089ae82921SPaul Mullowney PetscFunctionBegin; 109bc3f50f2SPaul Mullowney ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 110404133a2SPaul Mullowney (*B)->factortype = ftype; 111bc3f50f2SPaul Mullowney ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1129ae82921SPaul Mullowney ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1132205254eSKarl Rupp 114087f3262SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 11533d57670SJed Brown ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1169ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1179ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 118087f3262SPaul Mullowney } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 119087f3262SPaul Mullowney (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 120087f3262SPaul Mullowney (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 1219ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 122bc3f50f2SPaul Mullowney 123fa03d054SJed Brown ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 12462a20339SJed Brown ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); 1259ae82921SPaul Mullowney PetscFunctionReturn(0); 1269ae82921SPaul Mullowney } 1279ae82921SPaul Mullowney 1289ae82921SPaul Mullowney #undef __FUNCT__ 129e057df02SPaul Mullowney #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE" 130bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 131ca45077fSPaul Mullowney { 132aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1336e111a19SKarl Rupp 134ca45077fSPaul Mullowney PetscFunctionBegin; 1352692e278SPaul Mullowney #if CUDA_VERSION>=4020 136ca45077fSPaul Mullowney switch (op) { 137e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 138aa372e3fSPaul Mullowney cusparsestruct->format = format; 139ca45077fSPaul Mullowney break; 140e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 141aa372e3fSPaul Mullowney cusparsestruct->format = format; 142ca45077fSPaul Mullowney break; 143ca45077fSPaul Mullowney default: 14436d62e41SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 145ca45077fSPaul Mullowney } 1462692e278SPaul Mullowney #else 1472692e278SPaul Mullowney if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) 1482692e278SPaul Mullowney SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later."); 1492692e278SPaul Mullowney #endif 150ca45077fSPaul Mullowney PetscFunctionReturn(0); 151ca45077fSPaul Mullowney } 1529ae82921SPaul Mullowney 153e057df02SPaul Mullowney /*@ 154e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 155e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 156aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 157e057df02SPaul Mullowney Not Collective 158e057df02SPaul Mullowney 159e057df02SPaul Mullowney Input Parameters: 1608468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 16136d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 1622692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 163e057df02SPaul Mullowney 164e057df02SPaul Mullowney Output Parameter: 165e057df02SPaul Mullowney 166e057df02SPaul Mullowney Level: intermediate 167e057df02SPaul Mullowney 1688468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 169e057df02SPaul Mullowney @*/ 170e057df02SPaul Mullowney #undef __FUNCT__ 171e057df02SPaul Mullowney #define __FUNCT__ "MatCUSPARSESetFormat" 172e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 173e057df02SPaul Mullowney { 174e057df02SPaul Mullowney PetscErrorCode ierr; 1756e111a19SKarl Rupp 176e057df02SPaul Mullowney PetscFunctionBegin; 177e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 178e057df02SPaul Mullowney ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 179e057df02SPaul Mullowney PetscFunctionReturn(0); 180e057df02SPaul Mullowney } 181e057df02SPaul Mullowney 1829ae82921SPaul Mullowney #undef __FUNCT__ 1839ae82921SPaul Mullowney #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE" 1846fa9248bSJed Brown static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A) 1859ae82921SPaul Mullowney { 1869ae82921SPaul Mullowney PetscErrorCode ierr; 187e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 1889ae82921SPaul Mullowney PetscBool flg; 1896e111a19SKarl Rupp 1909ae82921SPaul Mullowney PetscFunctionBegin; 191e057df02SPaul Mullowney ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr); 192e057df02SPaul Mullowney ierr = PetscObjectOptionsBegin((PetscObject)A); 1939ae82921SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 194e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 1957083f85cSSatish Balay "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 196e057df02SPaul Mullowney if (flg) { 197e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 198045c96e1SPaul Mullowney } 1999ae82921SPaul Mullowney } 2004c87dfd4SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 2017083f85cSSatish Balay "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 2024c87dfd4SPaul Mullowney if (flg) { 2034c87dfd4SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 2044c87dfd4SPaul Mullowney } 2059ae82921SPaul Mullowney ierr = PetscOptionsEnd();CHKERRQ(ierr); 2069ae82921SPaul Mullowney PetscFunctionReturn(0); 2079ae82921SPaul Mullowney 2089ae82921SPaul Mullowney } 2099ae82921SPaul Mullowney 2109ae82921SPaul Mullowney #undef __FUNCT__ 2119ae82921SPaul Mullowney #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE" 2126fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 2139ae82921SPaul Mullowney { 2149ae82921SPaul Mullowney PetscErrorCode ierr; 2159ae82921SPaul Mullowney 2169ae82921SPaul Mullowney PetscFunctionBegin; 2179ae82921SPaul Mullowney ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 2189ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2199ae82921SPaul Mullowney PetscFunctionReturn(0); 2209ae82921SPaul Mullowney } 2219ae82921SPaul Mullowney 2229ae82921SPaul Mullowney #undef __FUNCT__ 2239ae82921SPaul Mullowney #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE" 2246fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 2259ae82921SPaul Mullowney { 2269ae82921SPaul Mullowney PetscErrorCode ierr; 2279ae82921SPaul Mullowney 2289ae82921SPaul Mullowney PetscFunctionBegin; 2299ae82921SPaul Mullowney ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 2309ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2319ae82921SPaul Mullowney PetscFunctionReturn(0); 2329ae82921SPaul Mullowney } 2339ae82921SPaul Mullowney 2349ae82921SPaul Mullowney #undef __FUNCT__ 235087f3262SPaul Mullowney #define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE" 236087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 237087f3262SPaul Mullowney { 238087f3262SPaul Mullowney PetscErrorCode ierr; 239087f3262SPaul Mullowney 240087f3262SPaul Mullowney PetscFunctionBegin; 241087f3262SPaul Mullowney ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 242087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 243087f3262SPaul Mullowney PetscFunctionReturn(0); 244087f3262SPaul Mullowney } 245087f3262SPaul Mullowney 246087f3262SPaul Mullowney #undef __FUNCT__ 247087f3262SPaul Mullowney #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE" 248087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 249087f3262SPaul Mullowney { 250087f3262SPaul Mullowney PetscErrorCode ierr; 251087f3262SPaul Mullowney 252087f3262SPaul Mullowney PetscFunctionBegin; 253087f3262SPaul Mullowney ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 254087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 255087f3262SPaul Mullowney PetscFunctionReturn(0); 256087f3262SPaul Mullowney } 257087f3262SPaul Mullowney 258087f3262SPaul Mullowney #undef __FUNCT__ 259087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix" 260087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 2619ae82921SPaul Mullowney { 2629ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2639ae82921SPaul Mullowney PetscInt n = A->rmap->n; 2649ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 265aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 2669ae82921SPaul Mullowney cusparseStatus_t stat; 2679ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 2689ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 2699ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 2709ae82921SPaul Mullowney PetscScalar *AALo; 2719ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 272b175d8bbSPaul Mullowney PetscErrorCode ierr; 2739ae82921SPaul Mullowney 2749ae82921SPaul Mullowney PetscFunctionBegin; 2759ae82921SPaul Mullowney if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 2769ae82921SPaul Mullowney try { 2779ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 2789ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 2799ae82921SPaul Mullowney 2809ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 2819ae82921SPaul Mullowney ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 2829ae82921SPaul Mullowney ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr); 2839ae82921SPaul Mullowney ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr); 2849ae82921SPaul Mullowney 2859ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 2869ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 2879ae82921SPaul Mullowney AiLo[n] = nzLower; 2889ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 2899ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 2909ae82921SPaul Mullowney v = aa; 2919ae82921SPaul Mullowney vi = aj; 2929ae82921SPaul Mullowney offset = 1; 2939ae82921SPaul Mullowney rowOffset= 1; 2949ae82921SPaul Mullowney for (i=1; i<n; i++) { 2959ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 296e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 2979ae82921SPaul Mullowney AiLo[i] = rowOffset; 2989ae82921SPaul Mullowney rowOffset += nz+1; 2999ae82921SPaul Mullowney 3009ae82921SPaul Mullowney ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 3019ae82921SPaul Mullowney ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 3029ae82921SPaul Mullowney 3039ae82921SPaul Mullowney offset += nz; 3049ae82921SPaul Mullowney AjLo[offset] = (PetscInt) i; 3059ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 3069ae82921SPaul Mullowney offset += 1; 3079ae82921SPaul Mullowney 3089ae82921SPaul Mullowney v += nz; 3099ae82921SPaul Mullowney vi += nz; 3109ae82921SPaul Mullowney } 3112205254eSKarl Rupp 312aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 313aa372e3fSPaul Mullowney loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 3142205254eSKarl Rupp 315aa372e3fSPaul Mullowney /* Create the matrix description */ 316aa372e3fSPaul Mullowney stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat); 317aa372e3fSPaul Mullowney stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 318aa372e3fSPaul Mullowney stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 319aa372e3fSPaul Mullowney stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat); 320aa372e3fSPaul Mullowney stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); 321aa372e3fSPaul Mullowney 322aa372e3fSPaul Mullowney /* Create the solve analysis information */ 323aa372e3fSPaul Mullowney stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat); 324aa372e3fSPaul Mullowney 325aa372e3fSPaul Mullowney /* set the operation */ 326aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 327aa372e3fSPaul Mullowney 328aa372e3fSPaul Mullowney /* set the matrix */ 329aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 330aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 331aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 332aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 333aa372e3fSPaul Mullowney 334aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 335aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 336aa372e3fSPaul Mullowney 337aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 338aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 339aa372e3fSPaul Mullowney 340aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 341aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 342aa372e3fSPaul Mullowney 343aa372e3fSPaul Mullowney /* perform the solve analysis */ 344aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 345aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 346aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 347aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat); 348aa372e3fSPaul Mullowney 349aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 350aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 3512205254eSKarl Rupp 3529ae82921SPaul Mullowney ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr); 3539ae82921SPaul Mullowney ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr); 3549ae82921SPaul Mullowney ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 3559ae82921SPaul Mullowney } catch(char *ex) { 3569ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 3579ae82921SPaul Mullowney } 3589ae82921SPaul Mullowney } 3599ae82921SPaul Mullowney PetscFunctionReturn(0); 3609ae82921SPaul Mullowney } 3619ae82921SPaul Mullowney 3629ae82921SPaul Mullowney #undef __FUNCT__ 363087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix" 364087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 3659ae82921SPaul Mullowney { 3669ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3679ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3689ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 369aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 3709ae82921SPaul Mullowney cusparseStatus_t stat; 3719ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 3729ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 3739ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 3749ae82921SPaul Mullowney PetscScalar *AAUp; 3759ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 3769ae82921SPaul Mullowney PetscErrorCode ierr; 3779ae82921SPaul Mullowney 3789ae82921SPaul Mullowney PetscFunctionBegin; 3799ae82921SPaul Mullowney if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 3809ae82921SPaul Mullowney try { 3819ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. */ 3829ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 3839ae82921SPaul Mullowney 3849ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 3859ae82921SPaul Mullowney ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 3869ae82921SPaul Mullowney ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 3879ae82921SPaul Mullowney ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 3889ae82921SPaul Mullowney 3899ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 3909ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 3919ae82921SPaul Mullowney AiUp[n]=nzUpper; 3929ae82921SPaul Mullowney offset = nzUpper; 3939ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 3949ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 3959ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 3969ae82921SPaul Mullowney 397e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 3989ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 3999ae82921SPaul Mullowney 400e057df02SPaul Mullowney /* decrement the offset */ 4019ae82921SPaul Mullowney offset -= (nz+1); 4029ae82921SPaul Mullowney 403e057df02SPaul Mullowney /* first, set the diagonal elements */ 4049ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 4059ae82921SPaul Mullowney AAUp[offset] = 1./v[nz]; 4069ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 4079ae82921SPaul Mullowney 4089ae82921SPaul Mullowney ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 4099ae82921SPaul Mullowney ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 4109ae82921SPaul Mullowney } 4112205254eSKarl Rupp 412aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 413aa372e3fSPaul Mullowney upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 4142205254eSKarl Rupp 415aa372e3fSPaul Mullowney /* Create the matrix description */ 416aa372e3fSPaul Mullowney stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat); 417aa372e3fSPaul Mullowney stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 418aa372e3fSPaul Mullowney stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 419aa372e3fSPaul Mullowney stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 420aa372e3fSPaul Mullowney stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 421aa372e3fSPaul Mullowney 422aa372e3fSPaul Mullowney /* Create the solve analysis information */ 423aa372e3fSPaul Mullowney stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat); 424aa372e3fSPaul Mullowney 425aa372e3fSPaul Mullowney /* set the operation */ 426aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 427aa372e3fSPaul Mullowney 428aa372e3fSPaul Mullowney /* set the matrix */ 429aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 430aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 431aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 432aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 433aa372e3fSPaul Mullowney 434aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 435aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 436aa372e3fSPaul Mullowney 437aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 438aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 439aa372e3fSPaul Mullowney 440aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 441aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 442aa372e3fSPaul Mullowney 443aa372e3fSPaul Mullowney /* perform the solve analysis */ 444aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 445aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 446aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 447aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat); 448aa372e3fSPaul Mullowney 449aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 450aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 4512205254eSKarl Rupp 4529ae82921SPaul Mullowney ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 4539ae82921SPaul Mullowney ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 4549ae82921SPaul Mullowney ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 4559ae82921SPaul Mullowney } catch(char *ex) { 4569ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 4579ae82921SPaul Mullowney } 4589ae82921SPaul Mullowney } 4599ae82921SPaul Mullowney PetscFunctionReturn(0); 4609ae82921SPaul Mullowney } 4619ae82921SPaul Mullowney 4629ae82921SPaul Mullowney #undef __FUNCT__ 463087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU" 464087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 4659ae82921SPaul Mullowney { 4669ae82921SPaul Mullowney PetscErrorCode ierr; 4679ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4689ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 4699ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 4709ae82921SPaul Mullowney PetscBool row_identity,col_identity; 4719ae82921SPaul Mullowney const PetscInt *r,*c; 4729ae82921SPaul Mullowney PetscInt n = A->rmap->n; 4739ae82921SPaul Mullowney 4749ae82921SPaul Mullowney PetscFunctionBegin; 475087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 476087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 4772205254eSKarl Rupp 478aa372e3fSPaul Mullowney cusparseTriFactors->workVector = new THRUSTARRAY; 479aa372e3fSPaul Mullowney cusparseTriFactors->workVector->resize(n); 480aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 4819ae82921SPaul Mullowney 4829ae82921SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_BOTH; 483e057df02SPaul Mullowney /*lower triangular indices */ 4849ae82921SPaul Mullowney ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 4859ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 4862205254eSKarl Rupp if (!row_identity) { 487aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 488aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 4892205254eSKarl Rupp } 4909ae82921SPaul Mullowney ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 4919ae82921SPaul Mullowney 492e057df02SPaul Mullowney /*upper triangular indices */ 4939ae82921SPaul Mullowney ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 4949ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 4952205254eSKarl Rupp if (!col_identity) { 496aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 497aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 4982205254eSKarl Rupp } 4999ae82921SPaul Mullowney ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 5009ae82921SPaul Mullowney PetscFunctionReturn(0); 5019ae82921SPaul Mullowney } 5029ae82921SPaul Mullowney 5039ae82921SPaul Mullowney #undef __FUNCT__ 504087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices" 505087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 506087f3262SPaul Mullowney { 507087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 508087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 509aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 510aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 511087f3262SPaul Mullowney cusparseStatus_t stat; 512087f3262SPaul Mullowney PetscErrorCode ierr; 513087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 514087f3262SPaul Mullowney PetscScalar *AAUp; 515087f3262SPaul Mullowney PetscScalar *AALo; 516087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 517087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 518087f3262SPaul Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 519087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 520087f3262SPaul Mullowney 521087f3262SPaul Mullowney PetscFunctionBegin; 522087f3262SPaul Mullowney if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 523087f3262SPaul Mullowney try { 524087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 525087f3262SPaul Mullowney ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 526087f3262SPaul Mullowney ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 527087f3262SPaul Mullowney ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 528087f3262SPaul Mullowney ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 529087f3262SPaul Mullowney 530087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 531087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 532087f3262SPaul Mullowney AiUp[n]=nzUpper; 533087f3262SPaul Mullowney offset = 0; 534087f3262SPaul Mullowney for (i=0; i<n; i++) { 535087f3262SPaul Mullowney /* set the pointers */ 536087f3262SPaul Mullowney v = aa + ai[i]; 537087f3262SPaul Mullowney vj = aj + ai[i]; 538087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 539087f3262SPaul Mullowney 540087f3262SPaul Mullowney /* first, set the diagonal elements */ 541087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 542087f3262SPaul Mullowney AAUp[offset] = 1.0/v[nz]; 543087f3262SPaul Mullowney AiUp[i] = offset; 544087f3262SPaul Mullowney AALo[offset] = 1.0/v[nz]; 545087f3262SPaul Mullowney 546087f3262SPaul Mullowney offset+=1; 547087f3262SPaul Mullowney if (nz>0) { 548087f3262SPaul Mullowney ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr); 549087f3262SPaul Mullowney ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 550087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 551087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 552087f3262SPaul Mullowney AALo[j] = AAUp[j]/v[nz]; 553087f3262SPaul Mullowney } 554087f3262SPaul Mullowney offset+=nz; 555087f3262SPaul Mullowney } 556087f3262SPaul Mullowney } 557087f3262SPaul Mullowney 558aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 559aa372e3fSPaul Mullowney upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 560087f3262SPaul Mullowney 561aa372e3fSPaul Mullowney /* Create the matrix description */ 562aa372e3fSPaul Mullowney stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat); 563aa372e3fSPaul Mullowney stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 564aa372e3fSPaul Mullowney stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 565aa372e3fSPaul Mullowney stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 566aa372e3fSPaul Mullowney stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat); 567087f3262SPaul Mullowney 568aa372e3fSPaul Mullowney /* Create the solve analysis information */ 569aa372e3fSPaul Mullowney stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat); 570aa372e3fSPaul Mullowney 571aa372e3fSPaul Mullowney /* set the operation */ 572aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 573aa372e3fSPaul Mullowney 574aa372e3fSPaul Mullowney /* set the matrix */ 575aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 576aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 577aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 578aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 579aa372e3fSPaul Mullowney 580aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 581aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 582aa372e3fSPaul Mullowney 583aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 584aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 585aa372e3fSPaul Mullowney 586aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 587aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 588aa372e3fSPaul Mullowney 589aa372e3fSPaul Mullowney /* perform the solve analysis */ 590aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 591aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 592aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 593aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat); 594aa372e3fSPaul Mullowney 595aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 596aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 597aa372e3fSPaul Mullowney 598aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 599aa372e3fSPaul Mullowney loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 600aa372e3fSPaul Mullowney 601aa372e3fSPaul Mullowney /* Create the matrix description */ 602aa372e3fSPaul Mullowney stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat); 603aa372e3fSPaul Mullowney stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 604aa372e3fSPaul Mullowney stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat); 605aa372e3fSPaul Mullowney stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 606aa372e3fSPaul Mullowney stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat); 607aa372e3fSPaul Mullowney 608aa372e3fSPaul Mullowney /* Create the solve analysis information */ 609aa372e3fSPaul Mullowney stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat); 610aa372e3fSPaul Mullowney 611aa372e3fSPaul Mullowney /* set the operation */ 612aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 613aa372e3fSPaul Mullowney 614aa372e3fSPaul Mullowney /* set the matrix */ 615aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 616aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 617aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 618aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 619aa372e3fSPaul Mullowney 620aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 621aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 622aa372e3fSPaul Mullowney 623aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 624aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 625aa372e3fSPaul Mullowney 626aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 627aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 628aa372e3fSPaul Mullowney 629aa372e3fSPaul Mullowney /* perform the solve analysis */ 630aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 631aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 632aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 633aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat); 634aa372e3fSPaul Mullowney 635aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 636aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 637087f3262SPaul Mullowney 638087f3262SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_BOTH; 639087f3262SPaul Mullowney ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 640087f3262SPaul Mullowney ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 641087f3262SPaul Mullowney ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 642087f3262SPaul Mullowney ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 643087f3262SPaul Mullowney } catch(char *ex) { 644087f3262SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 645087f3262SPaul Mullowney } 646087f3262SPaul Mullowney } 647087f3262SPaul Mullowney PetscFunctionReturn(0); 648087f3262SPaul Mullowney } 649087f3262SPaul Mullowney 650087f3262SPaul Mullowney #undef __FUNCT__ 651087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU" 652087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 6539ae82921SPaul Mullowney { 6549ae82921SPaul Mullowney PetscErrorCode ierr; 655087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 656087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 657087f3262SPaul Mullowney IS ip = a->row; 658087f3262SPaul Mullowney const PetscInt *rip; 659087f3262SPaul Mullowney PetscBool perm_identity; 660087f3262SPaul Mullowney PetscInt n = A->rmap->n; 661087f3262SPaul Mullowney 662087f3262SPaul Mullowney PetscFunctionBegin; 663087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 664aa372e3fSPaul Mullowney cusparseTriFactors->workVector = new THRUSTARRAY; 665aa372e3fSPaul Mullowney cusparseTriFactors->workVector->resize(n); 666aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 667aa372e3fSPaul Mullowney 668087f3262SPaul Mullowney /*lower triangular indices */ 669087f3262SPaul Mullowney ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 670087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 671087f3262SPaul Mullowney if (!perm_identity) { 672aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 673aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 674aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 675aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(rip, rip+n); 676087f3262SPaul Mullowney } 677087f3262SPaul Mullowney ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 678087f3262SPaul Mullowney PetscFunctionReturn(0); 679087f3262SPaul Mullowney } 680087f3262SPaul Mullowney 681087f3262SPaul Mullowney #undef __FUNCT__ 6829ae82921SPaul Mullowney #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE" 6836fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 6849ae82921SPaul Mullowney { 6859ae82921SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 6869ae82921SPaul Mullowney IS isrow = b->row,iscol = b->col; 6879ae82921SPaul Mullowney PetscBool row_identity,col_identity; 688b175d8bbSPaul Mullowney PetscErrorCode ierr; 6899ae82921SPaul Mullowney 6909ae82921SPaul Mullowney PetscFunctionBegin; 6919ae82921SPaul Mullowney ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 692e057df02SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 6939ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 6949ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 695bda325fcSPaul Mullowney if (row_identity && col_identity) { 696bda325fcSPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 697bda325fcSPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 698bda325fcSPaul Mullowney } else { 699bda325fcSPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 700bda325fcSPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 701bda325fcSPaul Mullowney } 7028dc1d2a3SPaul Mullowney 703e057df02SPaul Mullowney /* get the triangular factors */ 704087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 7059ae82921SPaul Mullowney PetscFunctionReturn(0); 7069ae82921SPaul Mullowney } 7079ae82921SPaul Mullowney 708087f3262SPaul Mullowney #undef __FUNCT__ 709087f3262SPaul Mullowney #define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE" 710087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 711087f3262SPaul Mullowney { 712087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 713087f3262SPaul Mullowney IS ip = b->row; 714087f3262SPaul Mullowney PetscBool perm_identity; 715b175d8bbSPaul Mullowney PetscErrorCode ierr; 716087f3262SPaul Mullowney 717087f3262SPaul Mullowney PetscFunctionBegin; 718087f3262SPaul Mullowney ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 719087f3262SPaul Mullowney 720087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 721087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 722087f3262SPaul Mullowney if (perm_identity) { 723087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 724087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 725087f3262SPaul Mullowney } else { 726087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 727087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 728087f3262SPaul Mullowney } 729087f3262SPaul Mullowney 730087f3262SPaul Mullowney /* get the triangular factors */ 731087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 732087f3262SPaul Mullowney PetscFunctionReturn(0); 733087f3262SPaul Mullowney } 7349ae82921SPaul Mullowney 735bda325fcSPaul Mullowney #undef __FUNCT__ 736bda325fcSPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve" 737b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 738bda325fcSPaul Mullowney { 739bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 740aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 741aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 742aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 743aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 744bda325fcSPaul Mullowney cusparseStatus_t stat; 745aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 746aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 747aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 748aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 749b175d8bbSPaul Mullowney 750bda325fcSPaul Mullowney PetscFunctionBegin; 751bda325fcSPaul Mullowney 752aa372e3fSPaul Mullowney /*********************************************/ 753aa372e3fSPaul Mullowney /* Now the Transpose of the Lower Tri Factor */ 754aa372e3fSPaul Mullowney /*********************************************/ 755aa372e3fSPaul Mullowney 756aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 757aa372e3fSPaul Mullowney loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 758aa372e3fSPaul Mullowney 759aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 760aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 761aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 762aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 763aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 764aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 765aa372e3fSPaul Mullowney 766aa372e3fSPaul Mullowney /* Create the matrix description */ 767aa372e3fSPaul Mullowney stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat); 768aa372e3fSPaul Mullowney stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat); 769aa372e3fSPaul Mullowney stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat); 770aa372e3fSPaul Mullowney stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat); 771aa372e3fSPaul Mullowney stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat); 772aa372e3fSPaul Mullowney 773aa372e3fSPaul Mullowney /* Create the solve analysis information */ 774aa372e3fSPaul Mullowney stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat); 775aa372e3fSPaul Mullowney 776aa372e3fSPaul Mullowney /* set the operation */ 777aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 778aa372e3fSPaul Mullowney 779aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 780aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 781aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 782aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 783aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 784aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 785aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 786aa372e3fSPaul Mullowney loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 787aa372e3fSPaul Mullowney 788aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. the CSC */ 789aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 790aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 791aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 792aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 793aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 794aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 795aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 796aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 797aa372e3fSPaul Mullowney CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 798aa372e3fSPaul Mullowney 799aa372e3fSPaul Mullowney /* perform the solve analysis on the transposed matrix */ 800aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 801aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 802aa372e3fSPaul Mullowney loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 803aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 804aa372e3fSPaul Mullowney loTriFactorT->solveInfo);CHKERRCUSP(stat); 805aa372e3fSPaul Mullowney 806aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 807aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 808aa372e3fSPaul Mullowney 809aa372e3fSPaul Mullowney /*********************************************/ 810aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 811aa372e3fSPaul Mullowney /*********************************************/ 812aa372e3fSPaul Mullowney 813aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 814aa372e3fSPaul Mullowney upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 815aa372e3fSPaul Mullowney 816aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 817aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 818aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 819aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 820aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 821aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 822aa372e3fSPaul Mullowney 823aa372e3fSPaul Mullowney /* Create the matrix description */ 824aa372e3fSPaul Mullowney stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat); 825aa372e3fSPaul Mullowney stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat); 826aa372e3fSPaul Mullowney stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat); 827aa372e3fSPaul Mullowney stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat); 828aa372e3fSPaul Mullowney stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat); 829aa372e3fSPaul Mullowney 830aa372e3fSPaul Mullowney /* Create the solve analysis information */ 831aa372e3fSPaul Mullowney stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat); 832aa372e3fSPaul Mullowney 833aa372e3fSPaul Mullowney /* set the operation */ 834aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 835aa372e3fSPaul Mullowney 836aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 837aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 838aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 839aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 840aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 841aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 842aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 843aa372e3fSPaul Mullowney upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 844aa372e3fSPaul Mullowney 845aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. the CSC */ 846aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 847aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 848aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 849aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 850aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 851aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 852aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 853aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 854aa372e3fSPaul Mullowney CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 855aa372e3fSPaul Mullowney 856aa372e3fSPaul Mullowney /* perform the solve analysis on the transposed matrix */ 857aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 858aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 859aa372e3fSPaul Mullowney upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 860aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 861aa372e3fSPaul Mullowney upTriFactorT->solveInfo);CHKERRCUSP(stat); 862aa372e3fSPaul Mullowney 863aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 864aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 865bda325fcSPaul Mullowney PetscFunctionReturn(0); 866bda325fcSPaul Mullowney } 867bda325fcSPaul Mullowney 868bda325fcSPaul Mullowney #undef __FUNCT__ 869bda325fcSPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult" 870b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 871bda325fcSPaul Mullowney { 872aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 873aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 874aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 875bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 876bda325fcSPaul Mullowney cusparseStatus_t stat; 877aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 878b06137fdSPaul Mullowney cudaError_t err; 879b175d8bbSPaul Mullowney 880bda325fcSPaul Mullowney PetscFunctionBegin; 881aa372e3fSPaul Mullowney 882aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 883aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 884aa372e3fSPaul Mullowney stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat); 885aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 886aa372e3fSPaul Mullowney stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat); 887aa372e3fSPaul Mullowney stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat); 888aa372e3fSPaul Mullowney 889b06137fdSPaul Mullowney /* set alpha and beta */ 890b06137fdSPaul Mullowney err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err); 891b06137fdSPaul Mullowney err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 892b06137fdSPaul Mullowney err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err); 893b06137fdSPaul Mullowney err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 894b06137fdSPaul Mullowney stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 895b06137fdSPaul Mullowney 896aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 897aa372e3fSPaul Mullowney CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 898aa372e3fSPaul Mullowney CsrMatrix *matrixT= new CsrMatrix; 899aa372e3fSPaul Mullowney matrixT->num_rows = A->rmap->n; 900aa372e3fSPaul Mullowney matrixT->num_cols = A->cmap->n; 901aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 902aa372e3fSPaul Mullowney matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 903aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 904aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 905aa372e3fSPaul Mullowney 906aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. the CSC */ 907aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 908aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows, 909aa372e3fSPaul Mullowney matrix->num_cols, matrix->num_entries, 910aa372e3fSPaul Mullowney matrix->values->data().get(), 911aa372e3fSPaul Mullowney matrix->row_offsets->data().get(), 912aa372e3fSPaul Mullowney matrix->column_indices->data().get(), 913aa372e3fSPaul Mullowney matrixT->values->data().get(), 914aa372e3fSPaul Mullowney matrixT->column_indices->data().get(), 915aa372e3fSPaul Mullowney matrixT->row_offsets->data().get(), 916aa372e3fSPaul Mullowney CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 917aa372e3fSPaul Mullowney 918aa372e3fSPaul Mullowney /* assign the pointer */ 919aa372e3fSPaul Mullowney matstructT->mat = matrixT; 920aa372e3fSPaul Mullowney 921aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 9222692e278SPaul Mullowney #if CUDA_VERSION>=5000 923aa372e3fSPaul Mullowney /* First convert HYB to CSR */ 924aa372e3fSPaul Mullowney CsrMatrix *temp= new CsrMatrix; 925aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 926aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 927aa372e3fSPaul Mullowney temp->num_entries = a->nz; 928aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 929aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 930aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 931aa372e3fSPaul Mullowney 9322692e278SPaul Mullowney 933aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 934aa372e3fSPaul Mullowney matstruct->descr, (cusparseHybMat_t)matstruct->mat, 935aa372e3fSPaul Mullowney temp->values->data().get(), 936aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 937aa372e3fSPaul Mullowney temp->column_indices->data().get());CHKERRCUSP(stat); 938aa372e3fSPaul Mullowney 939aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 940aa372e3fSPaul Mullowney CsrMatrix *tempT= new CsrMatrix; 941aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 942aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 943aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 944aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 945aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 946aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 947aa372e3fSPaul Mullowney 948aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 949aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 950aa372e3fSPaul Mullowney temp->values->data().get(), 951aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 952aa372e3fSPaul Mullowney temp->column_indices->data().get(), 953aa372e3fSPaul Mullowney tempT->values->data().get(), 954aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 955aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 956aa372e3fSPaul Mullowney CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat); 957aa372e3fSPaul Mullowney 958aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 959aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 960aa372e3fSPaul Mullowney stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat); 961aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 962aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 963aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 964aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 965aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 966aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 967aa372e3fSPaul Mullowney hybMat, 0, partition);CHKERRCUSP(stat); 968aa372e3fSPaul Mullowney 969aa372e3fSPaul Mullowney /* assign the pointer */ 970aa372e3fSPaul Mullowney matstructT->mat = hybMat; 971aa372e3fSPaul Mullowney 972aa372e3fSPaul Mullowney /* delete temporaries */ 973aa372e3fSPaul Mullowney if (tempT) { 974aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 975aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 976aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 977aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 978087f3262SPaul Mullowney } 979aa372e3fSPaul Mullowney if (temp) { 980aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 981aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 982aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 983aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 984aa372e3fSPaul Mullowney } 9852692e278SPaul Mullowney #else 9862692e278SPaul Mullowney SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later."); 9872692e278SPaul Mullowney #endif 988aa372e3fSPaul Mullowney } 989aa372e3fSPaul Mullowney /* assign the compressed row indices */ 990aa372e3fSPaul Mullowney matstructT->cprowIndices = new THRUSTINTARRAY; 991aa372e3fSPaul Mullowney 992aa372e3fSPaul Mullowney /* assign the pointer */ 993aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 994bda325fcSPaul Mullowney PetscFunctionReturn(0); 995bda325fcSPaul Mullowney } 996bda325fcSPaul Mullowney 997bda325fcSPaul Mullowney #undef __FUNCT__ 998bda325fcSPaul Mullowney #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE" 9996fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1000bda325fcSPaul Mullowney { 1001bda325fcSPaul Mullowney CUSPARRAY *xGPU, *bGPU; 1002bda325fcSPaul Mullowney cusparseStatus_t stat; 1003bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1004aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1005aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1006aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1007b175d8bbSPaul Mullowney PetscErrorCode ierr; 1008bda325fcSPaul Mullowney 1009bda325fcSPaul Mullowney PetscFunctionBegin; 1010aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1011aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1012bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1013aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1014aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1015bda325fcSPaul Mullowney } 1016bda325fcSPaul Mullowney 1017bda325fcSPaul Mullowney /* Get the GPU pointers */ 1018bda325fcSPaul Mullowney ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1019bda325fcSPaul Mullowney ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 1020bda325fcSPaul Mullowney 1021aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1022aa372e3fSPaul Mullowney thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()), 1023aa372e3fSPaul Mullowney thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()), 1024aa372e3fSPaul Mullowney xGPU->begin()); 1025aa372e3fSPaul Mullowney 1026aa372e3fSPaul Mullowney /* First, solve U */ 1027aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1028aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 1029aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1030aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1031aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1032aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1033aa372e3fSPaul Mullowney xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1034aa372e3fSPaul Mullowney 1035aa372e3fSPaul Mullowney /* Then, solve L */ 1036aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1037aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1038aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1039aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1040aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1041aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1042aa372e3fSPaul Mullowney tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1043aa372e3fSPaul Mullowney 1044aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1045aa372e3fSPaul Mullowney thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1046aa372e3fSPaul Mullowney thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()), 1047aa372e3fSPaul Mullowney tempGPU->begin()); 1048aa372e3fSPaul Mullowney 1049aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1050aa372e3fSPaul Mullowney thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin()); 1051bda325fcSPaul Mullowney 1052bda325fcSPaul Mullowney /* restore */ 1053bda325fcSPaul Mullowney ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 1054bda325fcSPaul Mullowney ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1055bda325fcSPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 1056087f3262SPaul Mullowney 1057aa372e3fSPaul Mullowney ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1058bda325fcSPaul Mullowney PetscFunctionReturn(0); 1059bda325fcSPaul Mullowney } 1060bda325fcSPaul Mullowney 1061bda325fcSPaul Mullowney #undef __FUNCT__ 1062bda325fcSPaul Mullowney #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering" 10636fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1064bda325fcSPaul Mullowney { 1065bda325fcSPaul Mullowney CUSPARRAY *xGPU,*bGPU; 1066bda325fcSPaul Mullowney cusparseStatus_t stat; 1067bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1068aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1069aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1070aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1071b175d8bbSPaul Mullowney PetscErrorCode ierr; 1072bda325fcSPaul Mullowney 1073bda325fcSPaul Mullowney PetscFunctionBegin; 1074aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1075aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1076bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1077aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1078aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1079bda325fcSPaul Mullowney } 1080bda325fcSPaul Mullowney 1081bda325fcSPaul Mullowney /* Get the GPU pointers */ 1082bda325fcSPaul Mullowney ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1083bda325fcSPaul Mullowney ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 1084bda325fcSPaul Mullowney 1085aa372e3fSPaul Mullowney /* First, solve U */ 1086aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1087aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 1088aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1089aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1090aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1091aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1092aa372e3fSPaul Mullowney bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1093aa372e3fSPaul Mullowney 1094aa372e3fSPaul Mullowney /* Then, solve L */ 1095aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1096aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1097aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1098aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1099aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1100aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1101aa372e3fSPaul Mullowney tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1102bda325fcSPaul Mullowney 1103bda325fcSPaul Mullowney /* restore */ 1104bda325fcSPaul Mullowney ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 1105bda325fcSPaul Mullowney ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 1106bda325fcSPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 1107aa372e3fSPaul Mullowney ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1108bda325fcSPaul Mullowney PetscFunctionReturn(0); 1109bda325fcSPaul Mullowney } 1110bda325fcSPaul Mullowney 11119ae82921SPaul Mullowney #undef __FUNCT__ 11129ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE" 11136fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 11149ae82921SPaul Mullowney { 1115bda325fcSPaul Mullowney CUSPARRAY *xGPU,*bGPU; 11169ae82921SPaul Mullowney cusparseStatus_t stat; 11179ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1118aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1119aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1120aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1121b175d8bbSPaul Mullowney PetscErrorCode ierr; 11229ae82921SPaul Mullowney 11239ae82921SPaul Mullowney PetscFunctionBegin; 1124e057df02SPaul Mullowney /* Get the GPU pointers */ 11259ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 11269ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 11279ae82921SPaul Mullowney 1128aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1129aa372e3fSPaul Mullowney thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()), 1130aa372e3fSPaul Mullowney thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()), 1131aa372e3fSPaul Mullowney xGPU->begin()); 1132aa372e3fSPaul Mullowney 1133aa372e3fSPaul Mullowney /* Next, solve L */ 1134aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1135aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1136aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1137aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1138aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1139aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1140aa372e3fSPaul Mullowney xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1141aa372e3fSPaul Mullowney 1142aa372e3fSPaul Mullowney /* Then, solve U */ 1143aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1144aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1145aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1146aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1147aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1148aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1149aa372e3fSPaul Mullowney tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 1150aa372e3fSPaul Mullowney 1151aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1152aa372e3fSPaul Mullowney thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1153aa372e3fSPaul Mullowney thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()), 1154aa372e3fSPaul Mullowney tempGPU->begin()); 1155aa372e3fSPaul Mullowney 1156aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1157aa372e3fSPaul Mullowney thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin()); 11589ae82921SPaul Mullowney 11599ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 11609ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 11619ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 1162aa372e3fSPaul Mullowney ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 11639ae82921SPaul Mullowney PetscFunctionReturn(0); 11649ae82921SPaul Mullowney } 11659ae82921SPaul Mullowney 11669ae82921SPaul Mullowney #undef __FUNCT__ 11679ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering" 11686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 11699ae82921SPaul Mullowney { 1170bda325fcSPaul Mullowney CUSPARRAY *xGPU,*bGPU; 11719ae82921SPaul Mullowney cusparseStatus_t stat; 11729ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1173aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1174aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1175aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1176b175d8bbSPaul Mullowney PetscErrorCode ierr; 11779ae82921SPaul Mullowney 11789ae82921SPaul Mullowney PetscFunctionBegin; 1179e057df02SPaul Mullowney /* Get the GPU pointers */ 11809ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 11819ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 11829ae82921SPaul Mullowney 1183aa372e3fSPaul Mullowney /* First, solve L */ 1184aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1185aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1186aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1187aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1188aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1189aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1190aa372e3fSPaul Mullowney bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat); 1191aa372e3fSPaul Mullowney 1192aa372e3fSPaul Mullowney /* Next, solve U */ 1193aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1194aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1195aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1196aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1197aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1198aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1199aa372e3fSPaul Mullowney tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat); 12009ae82921SPaul Mullowney 12019ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 12029ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 12039ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 1204aa372e3fSPaul Mullowney ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 12059ae82921SPaul Mullowney PetscFunctionReturn(0); 12069ae82921SPaul Mullowney } 12079ae82921SPaul Mullowney 12089ae82921SPaul Mullowney #undef __FUNCT__ 1209e057df02SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU" 12106fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 12119ae82921SPaul Mullowney { 12129ae82921SPaul Mullowney 1213aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1214aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 12159ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 12169ae82921SPaul Mullowney PetscInt m = A->rmap->n,*ii,*ridx; 12179ae82921SPaul Mullowney PetscErrorCode ierr; 1218aa372e3fSPaul Mullowney cusparseStatus_t stat; 1219b06137fdSPaul Mullowney cudaError_t err; 12209ae82921SPaul Mullowney 12219ae82921SPaul Mullowney PetscFunctionBegin; 12229ae82921SPaul Mullowney if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) { 12239ae82921SPaul Mullowney ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1224aa372e3fSPaul Mullowney if (matstruct) { 1225aa372e3fSPaul Mullowney SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized."); 12269ae82921SPaul Mullowney } 12279ae82921SPaul Mullowney try { 1228aa372e3fSPaul Mullowney cusparsestruct->nonzerorow=0; 1229aa372e3fSPaul Mullowney for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0); 12309ae82921SPaul Mullowney 12319ae82921SPaul Mullowney if (a->compressedrow.use) { 12329ae82921SPaul Mullowney m = a->compressedrow.nrows; 12339ae82921SPaul Mullowney ii = a->compressedrow.i; 12349ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 12359ae82921SPaul Mullowney } else { 1236b06137fdSPaul Mullowney /* Forcing compressed row on the GPU */ 12379ae82921SPaul Mullowney int k=0; 1238785e854fSJed Brown ierr = PetscMalloc1((cusparsestruct->nonzerorow+1), &ii);CHKERRQ(ierr); 1239785e854fSJed Brown ierr = PetscMalloc1((cusparsestruct->nonzerorow), &ridx);CHKERRQ(ierr); 12409ae82921SPaul Mullowney ii[0]=0; 12419ae82921SPaul Mullowney for (int j = 0; j<m; j++) { 12429ae82921SPaul Mullowney if ((a->i[j+1]-a->i[j])>0) { 12439ae82921SPaul Mullowney ii[k] = a->i[j]; 12449ae82921SPaul Mullowney ridx[k]= j; 12459ae82921SPaul Mullowney k++; 12469ae82921SPaul Mullowney } 12479ae82921SPaul Mullowney } 1248aa372e3fSPaul Mullowney ii[cusparsestruct->nonzerorow] = a->nz; 1249aa372e3fSPaul Mullowney m = cusparsestruct->nonzerorow; 12509ae82921SPaul Mullowney } 12519ae82921SPaul Mullowney 1252aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 1253aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1254aa372e3fSPaul Mullowney stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat); 1255aa372e3fSPaul Mullowney stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat); 1256aa372e3fSPaul Mullowney stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat); 12579ae82921SPaul Mullowney 1258b06137fdSPaul Mullowney err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err); 1259b06137fdSPaul Mullowney err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 1260b06137fdSPaul Mullowney err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err); 1261b06137fdSPaul Mullowney err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err); 1262b06137fdSPaul Mullowney stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat); 1263b06137fdSPaul Mullowney 1264aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1265aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1266aa372e3fSPaul Mullowney /* set the matrix */ 1267aa372e3fSPaul Mullowney CsrMatrix *matrix= new CsrMatrix; 1268a65300a6SPaul Mullowney matrix->num_rows = m; 1269aa372e3fSPaul Mullowney matrix->num_cols = A->cmap->n; 1270aa372e3fSPaul Mullowney matrix->num_entries = a->nz; 1271a65300a6SPaul Mullowney matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1272a65300a6SPaul Mullowney matrix->row_offsets->assign(ii, ii + m+1); 12739ae82921SPaul Mullowney 1274aa372e3fSPaul Mullowney matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1275aa372e3fSPaul Mullowney matrix->column_indices->assign(a->j, a->j+a->nz); 1276aa372e3fSPaul Mullowney 1277aa372e3fSPaul Mullowney matrix->values = new THRUSTARRAY(a->nz); 1278aa372e3fSPaul Mullowney matrix->values->assign(a->a, a->a+a->nz); 1279aa372e3fSPaul Mullowney 1280aa372e3fSPaul Mullowney /* assign the pointer */ 1281aa372e3fSPaul Mullowney matstruct->mat = matrix; 1282aa372e3fSPaul Mullowney 1283aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 12842692e278SPaul Mullowney #if CUDA_VERSION>=4020 1285aa372e3fSPaul Mullowney CsrMatrix *matrix= new CsrMatrix; 1286a65300a6SPaul Mullowney matrix->num_rows = m; 1287aa372e3fSPaul Mullowney matrix->num_cols = A->cmap->n; 1288aa372e3fSPaul Mullowney matrix->num_entries = a->nz; 1289a65300a6SPaul Mullowney matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1290a65300a6SPaul Mullowney matrix->row_offsets->assign(ii, ii + m+1); 1291aa372e3fSPaul Mullowney 1292aa372e3fSPaul Mullowney matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1293aa372e3fSPaul Mullowney matrix->column_indices->assign(a->j, a->j+a->nz); 1294aa372e3fSPaul Mullowney 1295aa372e3fSPaul Mullowney matrix->values = new THRUSTARRAY(a->nz); 1296aa372e3fSPaul Mullowney matrix->values->assign(a->a, a->a+a->nz); 1297aa372e3fSPaul Mullowney 1298aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 1299aa372e3fSPaul Mullowney stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat); 1300aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1301aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1302a65300a6SPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols, 1303aa372e3fSPaul Mullowney matstruct->descr, matrix->values->data().get(), 1304aa372e3fSPaul Mullowney matrix->row_offsets->data().get(), 1305aa372e3fSPaul Mullowney matrix->column_indices->data().get(), 1306aa372e3fSPaul Mullowney hybMat, 0, partition);CHKERRCUSP(stat); 1307aa372e3fSPaul Mullowney /* assign the pointer */ 1308aa372e3fSPaul Mullowney matstruct->mat = hybMat; 1309aa372e3fSPaul Mullowney 1310aa372e3fSPaul Mullowney if (matrix) { 1311aa372e3fSPaul Mullowney if (matrix->values) delete (THRUSTARRAY*)matrix->values; 1312aa372e3fSPaul Mullowney if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices; 1313aa372e3fSPaul Mullowney if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets; 1314aa372e3fSPaul Mullowney delete (CsrMatrix*)matrix; 1315087f3262SPaul Mullowney } 13162692e278SPaul Mullowney #endif 1317087f3262SPaul Mullowney } 1318ca45077fSPaul Mullowney 1319aa372e3fSPaul Mullowney /* assign the compressed row indices */ 1320aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 1321aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 1322aa372e3fSPaul Mullowney 1323aa372e3fSPaul Mullowney /* assign the pointer */ 1324aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 1325aa372e3fSPaul Mullowney 13269ae82921SPaul Mullowney if (!a->compressedrow.use) { 13279ae82921SPaul Mullowney ierr = PetscFree(ii);CHKERRQ(ierr); 13289ae82921SPaul Mullowney ierr = PetscFree(ridx);CHKERRQ(ierr); 13299ae82921SPaul Mullowney } 1330aa372e3fSPaul Mullowney cusparsestruct->workVector = new THRUSTARRAY; 1331aa372e3fSPaul Mullowney cusparsestruct->workVector->resize(m); 13329ae82921SPaul Mullowney } catch(char *ex) { 13339ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 13349ae82921SPaul Mullowney } 13359ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 13362205254eSKarl Rupp 13379ae82921SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_BOTH; 13382205254eSKarl Rupp 13399ae82921SPaul Mullowney ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 13409ae82921SPaul Mullowney } 13419ae82921SPaul Mullowney PetscFunctionReturn(0); 13429ae82921SPaul Mullowney } 13439ae82921SPaul Mullowney 13449ae82921SPaul Mullowney #undef __FUNCT__ 13452a7a6963SBarry Smith #define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE" 13462a7a6963SBarry Smith static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left) 13479ae82921SPaul Mullowney { 13489ae82921SPaul Mullowney PetscErrorCode ierr; 134933d57670SJed Brown PetscInt rbs,cbs; 13509ae82921SPaul Mullowney 13519ae82921SPaul Mullowney PetscFunctionBegin; 135233d57670SJed Brown ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr); 13539ae82921SPaul Mullowney if (right) { 1354ce94432eSBarry Smith ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr); 13559ae82921SPaul Mullowney ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 135633d57670SJed Brown ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr); 13579ae82921SPaul Mullowney ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr); 13589ae82921SPaul Mullowney ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 13599ae82921SPaul Mullowney } 13609ae82921SPaul Mullowney if (left) { 1361ce94432eSBarry Smith ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr); 13629ae82921SPaul Mullowney ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 136333d57670SJed Brown ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr); 13649ae82921SPaul Mullowney ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr); 13659ae82921SPaul Mullowney ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 13669ae82921SPaul Mullowney } 13679ae82921SPaul Mullowney PetscFunctionReturn(0); 13689ae82921SPaul Mullowney } 13699ae82921SPaul Mullowney 1370aa372e3fSPaul Mullowney struct VecCUSPPlusEquals 1371aa372e3fSPaul Mullowney { 1372aa372e3fSPaul Mullowney template <typename Tuple> 1373aa372e3fSPaul Mullowney __host__ __device__ 1374aa372e3fSPaul Mullowney void operator()(Tuple t) 1375aa372e3fSPaul Mullowney { 1376aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1377aa372e3fSPaul Mullowney } 1378aa372e3fSPaul Mullowney }; 1379aa372e3fSPaul Mullowney 13809ae82921SPaul Mullowney #undef __FUNCT__ 13819ae82921SPaul Mullowney #define __FUNCT__ "MatMult_SeqAIJCUSPARSE" 13826fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 13839ae82921SPaul Mullowney { 13849ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1385aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1386aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1387bda325fcSPaul Mullowney CUSPARRAY *xarray,*yarray; 1388b175d8bbSPaul Mullowney PetscErrorCode ierr; 1389aa372e3fSPaul Mullowney cusparseStatus_t stat; 13909ae82921SPaul Mullowney 13919ae82921SPaul Mullowney PetscFunctionBegin; 1392e057df02SPaul Mullowney /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1393e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 13949ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 13959ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1396aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1397aa372e3fSPaul Mullowney CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1398aa372e3fSPaul Mullowney stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1399aa372e3fSPaul Mullowney mat->num_rows, mat->num_cols, mat->num_entries, 1400b06137fdSPaul Mullowney matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), 1401b06137fdSPaul Mullowney mat->column_indices->data().get(), xarray->data().get(), matstruct->beta, 1402aa372e3fSPaul Mullowney yarray->data().get());CHKERRCUSP(stat); 1403aa372e3fSPaul Mullowney } else { 14042692e278SPaul Mullowney #if CUDA_VERSION>=4020 1405aa372e3fSPaul Mullowney cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1406aa372e3fSPaul Mullowney stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1407b06137fdSPaul Mullowney matstruct->alpha, matstruct->descr, hybMat, 1408b06137fdSPaul Mullowney xarray->data().get(), matstruct->beta, 1409aa372e3fSPaul Mullowney yarray->data().get());CHKERRCUSP(stat); 14102692e278SPaul Mullowney #endif 14119ae82921SPaul Mullowney } 14129ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 14139ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1414aa372e3fSPaul Mullowney if (!cusparsestruct->stream) { 14159ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 1416ca45077fSPaul Mullowney } 1417aa372e3fSPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 14189ae82921SPaul Mullowney PetscFunctionReturn(0); 14199ae82921SPaul Mullowney } 14209ae82921SPaul Mullowney 14219ae82921SPaul Mullowney #undef __FUNCT__ 1422ca45077fSPaul Mullowney #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE" 14236fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1424ca45077fSPaul Mullowney { 1425ca45077fSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1426aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1427aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1428bda325fcSPaul Mullowney CUSPARRAY *xarray,*yarray; 1429b175d8bbSPaul Mullowney PetscErrorCode ierr; 1430aa372e3fSPaul Mullowney cusparseStatus_t stat; 1431ca45077fSPaul Mullowney 1432ca45077fSPaul Mullowney PetscFunctionBegin; 1433e057df02SPaul Mullowney /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1434e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1435aa372e3fSPaul Mullowney if (!matstructT) { 1436bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1437aa372e3fSPaul Mullowney matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1438bda325fcSPaul Mullowney } 1439ca45077fSPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1440ca45077fSPaul Mullowney ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1441aa372e3fSPaul Mullowney 1442aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1443aa372e3fSPaul Mullowney CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1444aa372e3fSPaul Mullowney stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1445aa372e3fSPaul Mullowney mat->num_rows, mat->num_cols, 1446b06137fdSPaul Mullowney mat->num_entries, matstructT->alpha, matstructT->descr, 1447aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 1448b06137fdSPaul Mullowney mat->column_indices->data().get(), xarray->data().get(), matstructT->beta, 1449aa372e3fSPaul Mullowney yarray->data().get());CHKERRCUSP(stat); 1450aa372e3fSPaul Mullowney } else { 14512692e278SPaul Mullowney #if CUDA_VERSION>=4020 1452aa372e3fSPaul Mullowney cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1453aa372e3fSPaul Mullowney stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1454b06137fdSPaul Mullowney matstructT->alpha, matstructT->descr, hybMat, 1455b06137fdSPaul Mullowney xarray->data().get(), matstructT->beta, 1456aa372e3fSPaul Mullowney yarray->data().get());CHKERRCUSP(stat); 14572692e278SPaul Mullowney #endif 1458ca45077fSPaul Mullowney } 1459ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1460ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1461aa372e3fSPaul Mullowney if (!cusparsestruct->stream) { 1462ca45077fSPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 1463ca45077fSPaul Mullowney } 1464aa372e3fSPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1465ca45077fSPaul Mullowney PetscFunctionReturn(0); 1466ca45077fSPaul Mullowney } 1467ca45077fSPaul Mullowney 1468aa372e3fSPaul Mullowney 1469ca45077fSPaul Mullowney #undef __FUNCT__ 14709ae82921SPaul Mullowney #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 14716fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 14729ae82921SPaul Mullowney { 14739ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1474aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1475aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1476bda325fcSPaul Mullowney CUSPARRAY *xarray,*yarray,*zarray; 1477b175d8bbSPaul Mullowney PetscErrorCode ierr; 1478aa372e3fSPaul Mullowney cusparseStatus_t stat; 14796e111a19SKarl Rupp 14809ae82921SPaul Mullowney PetscFunctionBegin; 1481e057df02SPaul Mullowney /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1482e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 14839ae82921SPaul Mullowney try { 14849ae82921SPaul Mullowney ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 14859ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 14869ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 14879ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 14889ae82921SPaul Mullowney 1489e057df02SPaul Mullowney /* multiply add */ 1490aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1491aa372e3fSPaul Mullowney CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1492b06137fdSPaul Mullowney /* here we need to be careful to set the number of rows in the multiply to the 1493b06137fdSPaul Mullowney number of compressed rows in the matrix ... which is equivalent to the 1494b06137fdSPaul Mullowney size of the workVector */ 1495aa372e3fSPaul Mullowney stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1496a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 1497b06137fdSPaul Mullowney mat->num_entries, matstruct->alpha, matstruct->descr, 1498aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 1499b06137fdSPaul Mullowney mat->column_indices->data().get(), xarray->data().get(), matstruct->beta, 1500aa372e3fSPaul Mullowney cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1501aa372e3fSPaul Mullowney } else { 15022692e278SPaul Mullowney #if CUDA_VERSION>=4020 1503aa372e3fSPaul Mullowney cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1504a65300a6SPaul Mullowney if (cusparsestruct->workVector->size()) { 1505aa372e3fSPaul Mullowney stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1506b06137fdSPaul Mullowney matstruct->alpha, matstruct->descr, hybMat, 1507b06137fdSPaul Mullowney xarray->data().get(), matstruct->beta, 1508aa372e3fSPaul Mullowney cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1509a65300a6SPaul Mullowney } 15102692e278SPaul Mullowney #endif 1511aa372e3fSPaul Mullowney } 1512aa372e3fSPaul Mullowney 1513aa372e3fSPaul Mullowney /* scatter the data from the temporary into the full vector with a += operation */ 1514aa372e3fSPaul Mullowney thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))), 1515aa372e3fSPaul Mullowney thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1516aa372e3fSPaul Mullowney VecCUSPPlusEquals()); 15179ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 15189ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 15199ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 15209ae82921SPaul Mullowney 15219ae82921SPaul Mullowney } catch(char *ex) { 15229ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 15239ae82921SPaul Mullowney } 15249ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 15259ae82921SPaul Mullowney ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 15269ae82921SPaul Mullowney PetscFunctionReturn(0); 15279ae82921SPaul Mullowney } 15289ae82921SPaul Mullowney 15299ae82921SPaul Mullowney #undef __FUNCT__ 1530b175d8bbSPaul Mullowney #define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE" 15316fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1532ca45077fSPaul Mullowney { 1533ca45077fSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1534aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1535aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1536ca45077fSPaul Mullowney CUSPARRAY *xarray,*yarray,*zarray; 1537b175d8bbSPaul Mullowney PetscErrorCode ierr; 1538aa372e3fSPaul Mullowney cusparseStatus_t stat; 15396e111a19SKarl Rupp 1540ca45077fSPaul Mullowney PetscFunctionBegin; 1541e057df02SPaul Mullowney /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 1542e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */ 1543aa372e3fSPaul Mullowney if (!matstructT) { 1544bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1545aa372e3fSPaul Mullowney matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1546bda325fcSPaul Mullowney } 1547aa372e3fSPaul Mullowney 1548ca45077fSPaul Mullowney try { 1549ca45077fSPaul Mullowney ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr); 1550ca45077fSPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1551ca45077fSPaul Mullowney ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr); 1552ca45077fSPaul Mullowney ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1553ca45077fSPaul Mullowney 1554e057df02SPaul Mullowney /* multiply add with matrix transpose */ 1555aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1556aa372e3fSPaul Mullowney CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1557b06137fdSPaul Mullowney /* here we need to be careful to set the number of rows in the multiply to the 1558b06137fdSPaul Mullowney number of compressed rows in the matrix ... which is equivalent to the 1559b06137fdSPaul Mullowney size of the workVector */ 1560aa372e3fSPaul Mullowney stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1561a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 1562b06137fdSPaul Mullowney mat->num_entries, matstructT->alpha, matstructT->descr, 1563aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 1564b06137fdSPaul Mullowney mat->column_indices->data().get(), xarray->data().get(), matstructT->beta, 1565aa372e3fSPaul Mullowney cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1566aa372e3fSPaul Mullowney } else { 15672692e278SPaul Mullowney #if CUDA_VERSION>=4020 1568aa372e3fSPaul Mullowney cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1569a65300a6SPaul Mullowney if (cusparsestruct->workVector->size()) { 1570aa372e3fSPaul Mullowney stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1571b06137fdSPaul Mullowney matstructT->alpha, matstructT->descr, hybMat, 1572b06137fdSPaul Mullowney xarray->data().get(), matstructT->beta, 1573aa372e3fSPaul Mullowney cusparsestruct->workVector->data().get());CHKERRCUSP(stat); 1574a65300a6SPaul Mullowney } 15752692e278SPaul Mullowney #endif 1576aa372e3fSPaul Mullowney } 1577aa372e3fSPaul Mullowney 1578aa372e3fSPaul Mullowney /* scatter the data from the temporary into the full vector with a += operation */ 1579aa372e3fSPaul Mullowney thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))), 1580aa372e3fSPaul Mullowney thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1581aa372e3fSPaul Mullowney VecCUSPPlusEquals()); 1582ca45077fSPaul Mullowney 1583ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1584ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr); 1585ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr); 1586ca45077fSPaul Mullowney 1587ca45077fSPaul Mullowney } catch(char *ex) { 1588ca45077fSPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1589ca45077fSPaul Mullowney } 1590ca45077fSPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 1591ca45077fSPaul Mullowney ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1592ca45077fSPaul Mullowney PetscFunctionReturn(0); 1593ca45077fSPaul Mullowney } 1594ca45077fSPaul Mullowney 1595ca45077fSPaul Mullowney #undef __FUNCT__ 15969ae82921SPaul Mullowney #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE" 15976fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 15989ae82921SPaul Mullowney { 15999ae82921SPaul Mullowney PetscErrorCode ierr; 16006e111a19SKarl Rupp 16019ae82921SPaul Mullowney PetscFunctionBegin; 16029ae82921SPaul Mullowney ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1603bc3f50f2SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 1604e057df02SPaul Mullowney ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1605bc3f50f2SPaul Mullowney } 16069ae82921SPaul Mullowney if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1607bbf3fe20SPaul Mullowney A->ops->mult = MatMult_SeqAIJCUSPARSE; 1608bbf3fe20SPaul Mullowney A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1609bbf3fe20SPaul Mullowney A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1610bbf3fe20SPaul Mullowney A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 16119ae82921SPaul Mullowney PetscFunctionReturn(0); 16129ae82921SPaul Mullowney } 16139ae82921SPaul Mullowney 16149ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 16159ae82921SPaul Mullowney #undef __FUNCT__ 16169ae82921SPaul Mullowney #define __FUNCT__ "MatCreateSeqAIJCUSPARSE" 1617e057df02SPaul Mullowney /*@ 16189ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1619e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 1620e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1621e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 1622e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 1623e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 16249ae82921SPaul Mullowney 16259ae82921SPaul Mullowney Collective on MPI_Comm 16269ae82921SPaul Mullowney 16279ae82921SPaul Mullowney Input Parameters: 16289ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 16299ae82921SPaul Mullowney . m - number of rows 16309ae82921SPaul Mullowney . n - number of columns 16319ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 16329ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 16330298fd71SBarry Smith (possibly different for each row) or NULL 16349ae82921SPaul Mullowney 16359ae82921SPaul Mullowney Output Parameter: 16369ae82921SPaul Mullowney . A - the matrix 16379ae82921SPaul Mullowney 16389ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 16399ae82921SPaul Mullowney MatXXXXSetPreallocation() paradgm instead of this routine directly. 16409ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 16419ae82921SPaul Mullowney 16429ae82921SPaul Mullowney Notes: 16439ae82921SPaul Mullowney If nnz is given then nz is ignored 16449ae82921SPaul Mullowney 16459ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 16469ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 16479ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 16489ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 16499ae82921SPaul Mullowney 16509ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 16510298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 16529ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 16539ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 16549ae82921SPaul Mullowney 16559ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 16569ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 16579ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 16589ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 16599ae82921SPaul Mullowney 16609ae82921SPaul Mullowney Level: intermediate 16619ae82921SPaul Mullowney 1662e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 16639ae82921SPaul Mullowney @*/ 16649ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 16659ae82921SPaul Mullowney { 16669ae82921SPaul Mullowney PetscErrorCode ierr; 16679ae82921SPaul Mullowney 16689ae82921SPaul Mullowney PetscFunctionBegin; 16699ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 16709ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 16719ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 16729ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 16739ae82921SPaul Mullowney PetscFunctionReturn(0); 16749ae82921SPaul Mullowney } 16759ae82921SPaul Mullowney 16769ae82921SPaul Mullowney #undef __FUNCT__ 16779ae82921SPaul Mullowney #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 16786fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 16799ae82921SPaul Mullowney { 16809ae82921SPaul Mullowney PetscErrorCode ierr; 1681aa372e3fSPaul Mullowney cusparseStatus_t stat; 1682b06137fdSPaul Mullowney cudaError_t err; 16839ae82921SPaul Mullowney PetscFunctionBegin; 16849ae82921SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 16859ae82921SPaul Mullowney try { 16869ae82921SPaul Mullowney if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) { 1687aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1688aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1689aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1690aa372e3fSPaul Mullowney 1691aa372e3fSPaul Mullowney /* delete all the members in each of the data structures */ 1692aa372e3fSPaul Mullowney if (matstruct) { 1693aa372e3fSPaul Mullowney if (matstruct->descr) { stat = cusparseDestroyMatDescr(matstruct->descr);CHKERRCUSP(stat); } 1694aa372e3fSPaul Mullowney if (matstruct->mat) { 1695aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 16962692e278SPaul Mullowney #if CUDA_VERSION>=4020 1697aa372e3fSPaul Mullowney cusparseHybMat_t hybMat = (cusparseHybMat_t) matstruct->mat; 1698aa372e3fSPaul Mullowney stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat); 16992692e278SPaul Mullowney #endif 1700aa372e3fSPaul Mullowney } else { 1701aa372e3fSPaul Mullowney CsrMatrix* mat = (CsrMatrix*)matstruct->mat; 1702aa372e3fSPaul Mullowney if (mat->values) delete (THRUSTARRAY*)mat->values; 1703aa372e3fSPaul Mullowney if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1704aa372e3fSPaul Mullowney if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1705aa372e3fSPaul Mullowney delete (CsrMatrix*)matstruct->mat; 17069ae82921SPaul Mullowney } 1707aa372e3fSPaul Mullowney } 1708b06137fdSPaul Mullowney if (matstruct->alpha) { err=cudaFree(matstruct->alpha);CHKERRCUSP(err); } 1709b06137fdSPaul Mullowney if (matstruct->beta) { err=cudaFree(matstruct->beta);CHKERRCUSP(err); } 1710aa372e3fSPaul Mullowney if (matstruct->cprowIndices) delete (THRUSTINTARRAY*)matstruct->cprowIndices; 1711aa372e3fSPaul Mullowney if (matstruct) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstruct; 1712aa372e3fSPaul Mullowney } 1713aa372e3fSPaul Mullowney if (matstructT) { 1714aa372e3fSPaul Mullowney if (matstructT->descr) { stat = cusparseDestroyMatDescr(matstructT->descr);CHKERRCUSP(stat); } 1715aa372e3fSPaul Mullowney if (matstructT->mat) { 1716aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 17172692e278SPaul Mullowney #if CUDA_VERSION>=4020 1718aa372e3fSPaul Mullowney cusparseHybMat_t hybMat = (cusparseHybMat_t) matstructT->mat; 1719aa372e3fSPaul Mullowney stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat); 17202692e278SPaul Mullowney #endif 1721aa372e3fSPaul Mullowney } else { 1722aa372e3fSPaul Mullowney CsrMatrix* matT = (CsrMatrix*)matstructT->mat; 1723aa372e3fSPaul Mullowney if (matT->values) delete (THRUSTARRAY*)matT->values; 1724aa372e3fSPaul Mullowney if (matT->column_indices) delete (THRUSTINTARRAY32*)matT->column_indices; 1725aa372e3fSPaul Mullowney if (matT->row_offsets) delete (THRUSTINTARRAY32*)matT->row_offsets; 1726aa372e3fSPaul Mullowney delete (CsrMatrix*)matstructT->mat; 1727aa372e3fSPaul Mullowney } 1728aa372e3fSPaul Mullowney } 1729b06137fdSPaul Mullowney if (matstructT->alpha) { err=cudaFree(matstructT->alpha);CHKERRCUSP(err); } 1730b06137fdSPaul Mullowney if (matstructT->beta) { err=cudaFree(matstructT->beta);CHKERRCUSP(err); } 1731aa372e3fSPaul Mullowney if (matstructT->cprowIndices) delete (THRUSTINTARRAY*)matstructT->cprowIndices; 1732aa372e3fSPaul Mullowney if (matstructT) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstructT; 1733aa372e3fSPaul Mullowney } 1734aa372e3fSPaul Mullowney if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector; 1735aa372e3fSPaul Mullowney if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); } 1736aa372e3fSPaul Mullowney if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct; 17379ae82921SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 1738aa372e3fSPaul Mullowney } else { 1739aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1740aa372e3fSPaul Mullowney if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); } 1741aa372e3fSPaul Mullowney if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct; 1742aa372e3fSPaul Mullowney } 17439ae82921SPaul Mullowney } catch(char *ex) { 17449ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 17459ae82921SPaul Mullowney } 17469ae82921SPaul Mullowney } else { 1747e057df02SPaul Mullowney /* The triangular factors */ 17489ae82921SPaul Mullowney try { 17499ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1750aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1751aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1752aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1753aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1754aa372e3fSPaul Mullowney 1755aa372e3fSPaul Mullowney /* delete all the members in each of the data structures */ 1756aa372e3fSPaul Mullowney if (loTriFactor) { 1757aa372e3fSPaul Mullowney if (loTriFactor->descr) { stat = cusparseDestroyMatDescr(loTriFactor->descr);CHKERRCUSP(stat); } 1758aa372e3fSPaul Mullowney if (loTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactor->solveInfo);CHKERRCUSP(stat); } 1759aa372e3fSPaul Mullowney if (loTriFactor->csrMat) { 1760aa372e3fSPaul Mullowney if (loTriFactor->csrMat->values) delete (THRUSTARRAY*)loTriFactor->csrMat->values; 1761aa372e3fSPaul Mullowney if (loTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->column_indices; 1762aa372e3fSPaul Mullowney if (loTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->row_offsets; 1763aa372e3fSPaul Mullowney delete (CsrMatrix*)loTriFactor->csrMat; 1764aa372e3fSPaul Mullowney } 1765aa372e3fSPaul Mullowney if (loTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactor; 1766aa372e3fSPaul Mullowney } 1767aa372e3fSPaul Mullowney 1768aa372e3fSPaul Mullowney if (upTriFactor) { 1769aa372e3fSPaul Mullowney if (upTriFactor->descr) { stat = cusparseDestroyMatDescr(upTriFactor->descr);CHKERRCUSP(stat); } 1770aa372e3fSPaul Mullowney if (upTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactor->solveInfo);CHKERRCUSP(stat); } 1771aa372e3fSPaul Mullowney if (upTriFactor->csrMat) { 1772aa372e3fSPaul Mullowney if (upTriFactor->csrMat->values) delete (THRUSTARRAY*)upTriFactor->csrMat->values; 1773aa372e3fSPaul Mullowney if (upTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->column_indices; 1774aa372e3fSPaul Mullowney if (upTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->row_offsets; 1775aa372e3fSPaul Mullowney delete (CsrMatrix*)upTriFactor->csrMat; 1776aa372e3fSPaul Mullowney } 1777aa372e3fSPaul Mullowney if (upTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactor; 1778aa372e3fSPaul Mullowney } 1779aa372e3fSPaul Mullowney 1780aa372e3fSPaul Mullowney if (loTriFactorT) { 1781aa372e3fSPaul Mullowney if (loTriFactorT->descr) { stat = cusparseDestroyMatDescr(loTriFactorT->descr);CHKERRCUSP(stat); } 1782aa372e3fSPaul Mullowney if (loTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactorT->solveInfo);CHKERRCUSP(stat); } 1783aa372e3fSPaul Mullowney if (loTriFactorT->csrMat) { 1784aa372e3fSPaul Mullowney if (loTriFactorT->csrMat->values) delete (THRUSTARRAY*)loTriFactorT->csrMat->values; 1785aa372e3fSPaul Mullowney if (loTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->column_indices; 1786aa372e3fSPaul Mullowney if (loTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->row_offsets; 1787aa372e3fSPaul Mullowney delete (CsrMatrix*)loTriFactorT->csrMat; 1788aa372e3fSPaul Mullowney } 1789aa372e3fSPaul Mullowney if (loTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactorT; 1790aa372e3fSPaul Mullowney } 1791aa372e3fSPaul Mullowney 1792aa372e3fSPaul Mullowney if (upTriFactorT) { 1793aa372e3fSPaul Mullowney if (upTriFactorT->descr) { stat = cusparseDestroyMatDescr(upTriFactorT->descr);CHKERRCUSP(stat); } 1794aa372e3fSPaul Mullowney if (upTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactorT->solveInfo);CHKERRCUSP(stat); } 1795aa372e3fSPaul Mullowney if (upTriFactorT->csrMat) { 1796aa372e3fSPaul Mullowney if (upTriFactorT->csrMat->values) delete (THRUSTARRAY*)upTriFactorT->csrMat->values; 1797aa372e3fSPaul Mullowney if (upTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->column_indices; 1798aa372e3fSPaul Mullowney if (upTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->row_offsets; 1799aa372e3fSPaul Mullowney delete (CsrMatrix*)upTriFactorT->csrMat; 1800aa372e3fSPaul Mullowney } 1801aa372e3fSPaul Mullowney if (upTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactorT; 1802aa372e3fSPaul Mullowney } 1803aa372e3fSPaul Mullowney if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices; 1804aa372e3fSPaul Mullowney if (cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices; 1805aa372e3fSPaul Mullowney if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector; 1806aa372e3fSPaul Mullowney if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); } 1807aa372e3fSPaul Mullowney if (cusparseTriFactors) delete (Mat_SeqAIJCUSPARSETriFactors*)cusparseTriFactors; 18089ae82921SPaul Mullowney } catch(char *ex) { 18099ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 18109ae82921SPaul Mullowney } 18119ae82921SPaul Mullowney } 18129ae82921SPaul Mullowney /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 18139ae82921SPaul Mullowney A->spptr = 0; 18149ae82921SPaul Mullowney 18159ae82921SPaul Mullowney ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 18169ae82921SPaul Mullowney PetscFunctionReturn(0); 18179ae82921SPaul Mullowney } 18189ae82921SPaul Mullowney 18199ae82921SPaul Mullowney #undef __FUNCT__ 18209ae82921SPaul Mullowney #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 18218cc058d9SJed Brown PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 18229ae82921SPaul Mullowney { 18239ae82921SPaul Mullowney PetscErrorCode ierr; 1824aa372e3fSPaul Mullowney cusparseStatus_t stat; 1825aa372e3fSPaul Mullowney cusparseHandle_t handle=0; 18269ae82921SPaul Mullowney 18279ae82921SPaul Mullowney PetscFunctionBegin; 18289ae82921SPaul Mullowney ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 18299ae82921SPaul Mullowney if (B->factortype==MAT_FACTOR_NONE) { 1830e057df02SPaul Mullowney /* you cannot check the inode.use flag here since the matrix was just created. 1831e057df02SPaul Mullowney now build a GPU matrix data structure */ 18329ae82921SPaul Mullowney B->spptr = new Mat_SeqAIJCUSPARSE; 18339ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1834aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1835aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1836e057df02SPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1837aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1838aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1839aa372e3fSPaul Mullowney stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1840aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1841aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 18429ae82921SPaul Mullowney } else { 18439ae82921SPaul Mullowney /* NEXT, set the pointers to the triangular factors */ 1844debe9ee2SPaul Mullowney B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 18459ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 18469ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1847aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1848aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1849aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1850aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1851aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1852aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1853aa372e3fSPaul Mullowney stat = cusparseCreate(&handle);CHKERRCUSP(stat); 1854aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1855aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 18569ae82921SPaul Mullowney } 1857aa372e3fSPaul Mullowney 1858e057df02SPaul Mullowney /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the 1859e057df02SPaul Mullowney default cusparse tri solve. Note the difference with the implementation in 1860e057df02SPaul Mullowney MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */ 1861bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr); 18622205254eSKarl Rupp 18639ae82921SPaul Mullowney B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 18649ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 18652a7a6963SBarry Smith B->ops->getvecs = MatCreateVecs_SeqAIJCUSPARSE; 18669ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1867ca45077fSPaul Mullowney B->ops->mult = MatMult_SeqAIJCUSPARSE; 1868ca45077fSPaul Mullowney B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1869ca45077fSPaul Mullowney B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1870ca45077fSPaul Mullowney B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 18712205254eSKarl Rupp 18729ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 18732205254eSKarl Rupp 18749ae82921SPaul Mullowney B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 18752205254eSKarl Rupp 1876bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 18779ae82921SPaul Mullowney PetscFunctionReturn(0); 18789ae82921SPaul Mullowney } 18799ae82921SPaul Mullowney 1880e057df02SPaul Mullowney /*M 1881e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1882e057df02SPaul Mullowney 1883e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 18842692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 18852692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1886e057df02SPaul Mullowney 1887e057df02SPaul Mullowney Options Database Keys: 1888e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1889aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1890aa372e3fSPaul Mullowney . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1891e057df02SPaul Mullowney 1892e057df02SPaul Mullowney Level: beginner 1893e057df02SPaul Mullowney 18948468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1895e057df02SPaul Mullowney M*/ 1896*7f756511SDominic Meiser 1897*7f756511SDominic Meiser #undef __FUNCT__ 1898*7f756511SDominic Meiser #define __FUNCT__ "Mat_SeqAIJCUSPARSE_Destroy" 1899*7f756511SDominic Meiser static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1900*7f756511SDominic Meiser { 1901*7f756511SDominic Meiser cusparseStatus_t stat; 1902*7f756511SDominic Meiser cusparseHandle_t handle; 1903*7f756511SDominic Meiser 1904*7f756511SDominic Meiser PetscFunctionBegin; 1905*7f756511SDominic Meiser if (*cusparsestruct) { 1906*7f756511SDominic Meiser Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1907*7f756511SDominic Meiser Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1908*7f756511SDominic Meiser delete (*cusparsestruct)->workVector; 1909*7f756511SDominic Meiser if (handle = (*cusparsestruct)->handle) { 1910*7f756511SDominic Meiser stat = cusparseDestroy(handle);CHKERRCUSP(stat); 1911*7f756511SDominic Meiser } 1912*7f756511SDominic Meiser delete *cusparsestruct; 1913*7f756511SDominic Meiser *cusparsestruct = 0; 1914*7f756511SDominic Meiser } 1915*7f756511SDominic Meiser PetscFunctionReturn(0); 1916*7f756511SDominic Meiser } 1917*7f756511SDominic Meiser 1918*7f756511SDominic Meiser #undef __FUNCT__ 1919*7f756511SDominic Meiser #define __FUNCT__ "CsrMatrix_Destroy" 1920*7f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1921*7f756511SDominic Meiser { 1922*7f756511SDominic Meiser PetscFunctionBegin; 1923*7f756511SDominic Meiser if (*mat) { 1924*7f756511SDominic Meiser delete (*mat)->values; 1925*7f756511SDominic Meiser delete (*mat)->column_indices; 1926*7f756511SDominic Meiser delete (*mat)->row_offsets; 1927*7f756511SDominic Meiser delete *mat; 1928*7f756511SDominic Meiser *mat = 0; 1929*7f756511SDominic Meiser } 1930*7f756511SDominic Meiser PetscFunctionReturn(0); 1931*7f756511SDominic Meiser } 1932*7f756511SDominic Meiser 1933*7f756511SDominic Meiser #undef __FUNCT__ 1934*7f756511SDominic Meiser #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactorStruct_Destroy" 1935*7f756511SDominic Meiser static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 1936*7f756511SDominic Meiser { 1937*7f756511SDominic Meiser cusparseStatus_t stat; 1938*7f756511SDominic Meiser PetscErrorCode ierr; 1939*7f756511SDominic Meiser 1940*7f756511SDominic Meiser PetscFunctionBegin; 1941*7f756511SDominic Meiser if (*trifactor) { 1942*7f756511SDominic Meiser if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSP(stat); } 1943*7f756511SDominic Meiser if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSP(stat); } 1944*7f756511SDominic Meiser ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 1945*7f756511SDominic Meiser delete *trifactor; 1946*7f756511SDominic Meiser *trifactor = 0; 1947*7f756511SDominic Meiser } 1948*7f756511SDominic Meiser PetscFunctionReturn(0); 1949*7f756511SDominic Meiser } 1950*7f756511SDominic Meiser 1951*7f756511SDominic Meiser #undef __FUNCT__ 1952*7f756511SDominic Meiser #define __FUNCT__ "Mat_SeqAIJCUSPARSEMultStruct_Destroy" 1953*7f756511SDominic Meiser static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 1954*7f756511SDominic Meiser { 1955*7f756511SDominic Meiser CsrMatrix *mat; 1956*7f756511SDominic Meiser cusparseStatus_t stat; 1957*7f756511SDominic Meiser cudaError_t err; 1958*7f756511SDominic Meiser 1959*7f756511SDominic Meiser PetscFunctionBegin; 1960*7f756511SDominic Meiser if (*matstruct) { 1961*7f756511SDominic Meiser if ((*matstruct)->mat) { 1962*7f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 1963*7f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 1964*7f756511SDominic Meiser stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat); 1965*7f756511SDominic Meiser } else { 1966*7f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 1967*7f756511SDominic Meiser CsrMatrix_Destroy(&mat); 1968*7f756511SDominic Meiser } 1969*7f756511SDominic Meiser } 1970*7f756511SDominic Meiser if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSP(stat); } 1971*7f756511SDominic Meiser delete (*matstruct)->cprowIndices; 1972*7f756511SDominic Meiser if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUSP(err); } 1973*7f756511SDominic Meiser if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUSP(err); } 1974*7f756511SDominic Meiser delete *matstruct; 1975*7f756511SDominic Meiser *matstruct = 0; 1976*7f756511SDominic Meiser } 1977*7f756511SDominic Meiser PetscFunctionReturn(0); 1978*7f756511SDominic Meiser } 1979*7f756511SDominic Meiser 1980*7f756511SDominic Meiser #undef __FUNCT__ 1981*7f756511SDominic Meiser #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactors_Destroy" 1982*7f756511SDominic Meiser static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 1983*7f756511SDominic Meiser { 1984*7f756511SDominic Meiser cusparseHandle_t handle; 1985*7f756511SDominic Meiser cusparseStatus_t stat; 1986*7f756511SDominic Meiser 1987*7f756511SDominic Meiser PetscFunctionBegin; 1988*7f756511SDominic Meiser if (*trifactors) { 1989*7f756511SDominic Meiser Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr); 1990*7f756511SDominic Meiser Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr); 1991*7f756511SDominic Meiser Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose); 1992*7f756511SDominic Meiser Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose); 1993*7f756511SDominic Meiser delete (*trifactors)->rpermIndices; 1994*7f756511SDominic Meiser delete (*trifactors)->cpermIndices; 1995*7f756511SDominic Meiser delete (*trifactors)->workVector; 1996*7f756511SDominic Meiser if (handle = (*trifactors)->handle) { 1997*7f756511SDominic Meiser stat = cusparseDestroy(handle);CHKERRCUSP(stat); 1998*7f756511SDominic Meiser } 1999*7f756511SDominic Meiser delete *trifactors; 2000*7f756511SDominic Meiser *trifactors = 0; 2001*7f756511SDominic Meiser } 2002*7f756511SDominic Meiser PetscFunctionReturn(0); 2003*7f756511SDominic Meiser } 2004*7f756511SDominic Meiser 2005