19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 79ae82921SPaul Mullowney 83d13b8fdSMatthew G. Knepley #include <petscconf.h> 93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 12af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 139ae82921SPaul Mullowney #undef VecType 143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15a2cee5feSJed Brown #include <thrust/adjacent_difference.h> 16*d0967f54SJacob Faibussowitsch #if PETSC_CPP_VERSION >= 14 17*d0967f54SJacob Faibussowitsch #define PETSC_HAVE_THRUST_ASYNC 1 18*d0967f54SJacob Faibussowitsch // thrust::for_each(thrust::cuda::par.on()) requires C++14 19a0e72f99SJunchao Zhang #include <thrust/async/for_each.h> 20*d0967f54SJacob Faibussowitsch #endif 21a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h> 22a2cee5feSJed Brown #include <thrust/remove.h> 23a2cee5feSJed Brown #include <thrust/sort.h> 24a2cee5feSJed Brown #include <thrust/unique.h> 25e8d2b73aSMark Adams 26e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0}; 27afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 28afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 29afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
30afb2bd1cSJunchao Zhang 31afb2bd1cSJunchao Zhang typedef enum { 32afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 33afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 34afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 35afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 36afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 37afb2bd1cSJunchao Zhang 38afb2bd1cSJunchao Zhang typedef enum { 39afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 40afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 41afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 42afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 43afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 44afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 45afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 46afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 47afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 48afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 49afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 50afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 51afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 52afb2bd1cSJunchao Zhang 53afb2bd1cSJunchao Zhang typedef enum { 54afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 55afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 56afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 57afb2bd1cSJunchao Zhang */ 58afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0}; 59afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", 
"CUSPARSE_SPMM_", 0}; 60afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0}; 61afb2bd1cSJunchao Zhang #endif 629ae82921SPaul Mullowney 63087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 64087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 65087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *); 66087f3262SPaul Mullowney 676fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 686fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 696fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *); 70087f3262SPaul Mullowney 716fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec); 726fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 736fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 746fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 75dbbe0bcdSBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject); 76a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure); 7733c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar); 786fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec); 796fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 806fa9248bSJed Brown static PetscErrorCode 
MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

/* Records the requested storage format in the per-matrix cuSPARSE context; dispatched via MatCUSPARSESetFormat() */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: a sequential matrix has a single storage format, so MULT and ALL coincide */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A - Matrix of type `MATSEQAIJCUSPARSE`
. op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,
  `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

.seealso: `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* PetscTryMethod() is a no-op when the matrix type does not provide the composed function */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(0);
}

/* Stores the CPU-solve preference in the per-matrix cuSPARSE context; dispatched via MatCUSPARSESetUseCPUSolve() */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Note:
  The cuSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  Use this method to specify if the solve is done on the CPU or GPU (GPU is the default).

  Level: intermediate

.seealso: `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(0);
}

/* Intercepts MAT_FORM_EXPLICIT_TRANSPOSE so the cached GPU transpose can be dropped; everything else defers to the CPU AIJ implementation */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/* Numeric LU factorization: factor on the CPU, then select GPU or CPU solve kernels and (optionally) stage the triangular factors on the GPU */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b     = (Mat_SeqAIJ *)B->data;
  IS                  isrow = b->row, iscol = b->col;
  PetscBool           row_identity, col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)B->spptr;

  PetscFunctionBegin;
  /* the CPU kernel needs current host values, and it leaves the factor valid only on the host */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(iscol, &col_identity));

  if (!cusparsestruct->use_cpu_solve) {
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}

/* Options processing for MATSEQAIJCUSPARSE: storage format, CPU-solve toggle, and (CUDA >= 11) cuSPARSE algorithm selections */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}

/* Builds (or refreshes) the unit-diagonal lower triangular factor L on the GPU from the host ILU factor stored in AIJ format */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                      n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct
*loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt  *ai = a->i, *aj = a->j, *vi;
  const MatScalar *aa = a->a, *v;
  PetscInt        *AiLo, *AjLo;
  PetscInt         i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the factor data is current on the host */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) { /* first call: build the full structure (pattern + values) */
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 is just the implicit unit diagonal */
        AiLo[0] = (PetscInt)0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt)0;
        AALo[0] = (MatScalar)1.0;
        v       = aa;
        vi      = aj;

        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          /* append the unit diagonal entry after the strictly-lower part of the row */
          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer for fast numeric-only updates; index buffers are no longer needed */
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Builds (or refreshes) the upper triangular factor U on the GPU from the host ILU factor stored in AIJ format */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/ 3989ae82921SPaul Mullowney nzUpper = adiag[0] - adiag[n]; 399da79fbbcSStefano Zampini if (!upTriFactor) { 4002cbc15d9SMark PetscScalar *AAUp; 4012cbc15d9SMark 4029566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 4032cbc15d9SMark 4049ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 4059566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 4069566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt))); 4079ae82921SPaul Mullowney 4089ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 4099ae82921SPaul Mullowney AiUp[0] = (PetscInt)0; 4109ae82921SPaul Mullowney AiUp[n] = nzUpper; 4119ae82921SPaul Mullowney offset = nzUpper; 4129ae82921SPaul Mullowney for (i = n - 1; i >= 0; i--) { 4139ae82921SPaul Mullowney v = aa + adiag[i + 1] + 1; 4149ae82921SPaul Mullowney vi = aj + adiag[i + 1] + 1; 4159ae82921SPaul Mullowney 416e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 4179ae82921SPaul Mullowney nz = adiag[i] - adiag[i + 1] - 1; 4189ae82921SPaul Mullowney 419e057df02SPaul Mullowney /* decrement the offset */ 4209ae82921SPaul Mullowney offset -= (nz + 1); 4219ae82921SPaul Mullowney 422e057df02SPaul Mullowney /* first, set the diagonal elements */ 4239ae82921SPaul Mullowney AjUp[offset] = (PetscInt)i; 42409f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1. 
/ v[nz]; 4259ae82921SPaul Mullowney AiUp[i] = AiUp[i + 1] - (nz + 1); 4269ae82921SPaul Mullowney 4279566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz)); 4289566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz)); 4299ae82921SPaul Mullowney } 4302205254eSKarl Rupp 431aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 4329566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactor)); 433da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 4342205254eSKarl Rupp 435aa372e3fSPaul Mullowney /* Create the matrix description */ 4369566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 4379566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 4381b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 4399566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 440afb2bd1cSJunchao Zhang #else 4419566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 442afb2bd1cSJunchao Zhang #endif 4439566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 4449566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 445aa372e3fSPaul Mullowney 446aa372e3fSPaul Mullowney /* set the operation */ 447aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 448aa372e3fSPaul Mullowney 449aa372e3fSPaul Mullowney /* set the matrix */ 450aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 451aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 452aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 453aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 
454aa372e3fSPaul Mullowney 455aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1); 456aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1); 457aa372e3fSPaul Mullowney 458aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 459aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper); 460aa372e3fSPaul Mullowney 461aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 462aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper); 463aa372e3fSPaul Mullowney 464afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 4659566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 466261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 4671b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 4689371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 4699371c9d4SSatish Balay upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 4709566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 471afb2bd1cSJunchao Zhang #endif 472afb2bd1cSJunchao Zhang 473aa372e3fSPaul Mullowney /* perform the solve analysis */ 4749371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 4759371c9d4SSatish Balay upTriFactor->csrMat->row_offsets->data().get(), 
upTriFactor->csrMat->column_indices->data().get(), 4761b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 4779371c9d4SSatish Balay upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 478d49cd2b7SBarry Smith #else 4795f80ce2aSJacob Faibussowitsch upTriFactor->solveInfo)); 480afb2bd1cSJunchao Zhang #endif 4819566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 4829566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 483aa372e3fSPaul Mullowney 484da79fbbcSStefano Zampini /* assign the pointer */ 485aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 4862cbc15d9SMark upTriFactor->AA_h = AAUp; 4879566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiUp)); 4889566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjUp)); 4899566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar))); 490da79fbbcSStefano Zampini } else { 49148a46eb9SPierre Jolivet if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar))); 492da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 493da79fbbcSStefano Zampini offset = nzUpper; 494da79fbbcSStefano Zampini for (i = n - 1; i >= 0; i--) { 495da79fbbcSStefano Zampini v = aa + adiag[i + 1] + 1; 496da79fbbcSStefano Zampini 497da79fbbcSStefano Zampini /* number of elements NOT on the diagonal */ 498da79fbbcSStefano Zampini nz = adiag[i] - adiag[i + 1] - 1; 499da79fbbcSStefano Zampini 500da79fbbcSStefano Zampini /* decrement the offset */ 501da79fbbcSStefano Zampini offset -= (nz + 1); 502da79fbbcSStefano Zampini 503da79fbbcSStefano Zampini /* first, set the diagonal elements */ 5042cbc15d9SMark upTriFactor->AA_h[offset] = 1. 
/ v[nz];
      PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
    }
    upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
    PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
  }
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - builds both ILU triangular factors on the GPU
   and caches the row/column permutations of the factorization on the device.

   After the lower and upper factors have been (re)built, the factored matrix is marked
   PETSC_OFFLOAD_BOTH. The device-side permutation arrays are only created once, and only
   when the corresponding index set is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) {
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, iscol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  /* (re)build the two triangular factors on the device */
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  /* scratch vector used between the two triangular solves; allocated lazily */
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: copied to the device only if non-identity and not already cached */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *ridx;

    PetscCall(ISGetIndices(isrow, &ridx));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(ridx, ridx + n);
    PetscCall(ISRestoreIndices(isrow, &ridx));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* column permutation: same lazy-caching scheme as above */
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *cidx;

    PetscCall(ISGetIndices(iscol, &cidx));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(cidx, cidx + n);
    PetscCall(ISRestoreIndices(iscol, &cidx));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - builds the upper triangular ICC factor (and its
   scaled copy used for the transposed/lower solve) on the GPU.  (Body continues below.)
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) {
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
/* (continuation of MatSeqAIJCUSPARSEBuildICCTriMatrices: remaining local declarations) */
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the upper factor values and the scaled (lower) copy */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* first factorization: build structure (AiUp/AjUp) as well as values */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* number of off-diagonal entries in row i */

          /* diagonal entry: store its inverse in both factors */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            /* negate the strictly-upper part; lower copy additionally scaled by 1/diag */
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* ---------------- upper triangular factor ---------------- */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* triangular-solve analysis for the upper factor */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* ---------------- lower triangular factor ---------------- */
        /* stored with the same (upper) structure but solved with the TRANSPOSE operation */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* triangular-solve analysis for the lower factor */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* structure already on the GPU: refresh only the numerical values */
        offset = 0;
        for (i = 0; i < n; i++) {
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* number of off-diagonal entries in row i */

          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - builds the ICC factors on the GPU and caches
   the factorization permutation (and its inverse) on the device when it is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) {
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* both triangles share the off-diagonal entries; diagonal counted once */
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the permutation and its inverse on the device (row and column sides) */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

/*
   MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky factorization.

   The factorization itself runs on the CPU (MatCholeskyFactorNumeric_SeqAIJ); the resulting
   triangular factors are then copied to the GPU, and the solve routines are selected
   depending on whether the factorization ordering is the identity.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
  IS          ip = b->row;
  PetscBool   perm_identity;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve variant: the NaturalOrdering kernels skip the permutation step */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* push the triangular factors to the GPU */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) {
824bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 825aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 826aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 827da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 828da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 829aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 830aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 831aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 832aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 833b175d8bbSPaul Mullowney 834bda325fcSPaul Mullowney PetscFunctionBegin; 835aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 8369566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactorT)); 837da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 838aa372e3fSPaul Mullowney 839aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 840aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 841aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 8429371c9d4SSatish Balay fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? 
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 843aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 844aa372e3fSPaul Mullowney 845aa372e3fSPaul Mullowney /* Create the matrix description */ 8469566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 8479566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 8489566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 8499566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 8509566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 851aa372e3fSPaul Mullowney 852aa372e3fSPaul Mullowney /* set the operation */ 853aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 854aa372e3fSPaul Mullowney 855aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 856aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 857afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 858afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 859aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 860afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1); 861afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 862afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 863aa372e3fSPaul Mullowney 864aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 865afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 8669371c9d4SSatish Balay PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), 8679371c9d4SSatish Balay loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 8689371c9d4SSatish Balay loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 8699566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize)); 870afb2bd1cSJunchao Zhang #endif 871afb2bd1cSJunchao Zhang 8729566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 8739371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 8749371c9d4SSatish Balay loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), 875afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 8769371c9d4SSatish Balay loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer)); 877afb2bd1cSJunchao Zhang #else 8789371c9d4SSatish Balay loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase)); 879afb2bd1cSJunchao Zhang #endif 8809566063dSJacob Faibussowitsch 
PetscCallCUDA(WaitForCUDA());
  /* BUGFIX: this was PetscLogEventBegin(), leaving the MAT_CUSPARSEGenerateTranspose event opened
     before the csr2csc above permanently unbalanced; it must be the matching End */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                            loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
#else
                                            loTriFactorT->solveInfo));
#endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  /* the transpose of an upper-triangular factor is lower triangular, hence the fill mode flip */
  fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat                 = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                                  upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                     upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
#else
                                     upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase));
#endif

  PetscCallCUDA(WaitForCUDA());
  /* BUGFIX: was PetscLogEventBegin(); this must close the MAT_CUSPARSEGenerateTranspose event begun above */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  /* TODO(refactor): this sequence duplicates the lower-factor analysis above; extract a shared helper */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                            upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
#else
                                            upTriFactorT->solveInfo));
#endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Unary functor used below: the csr2csc permutation is carried in PetscScalar values whose
   real part holds an integer index; this converts it back to PetscInt */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
};

/* Build (or update the numerical values of) an explicitly stored transpose of A on the GPU.
   For CSR the structure is created once and subsequent calls only permute the values via the
   cached csr2csc_i index map; for ELL/HYB (pre CUDA-11 only) the transpose is rebuilt through
   HYB->CSR->CSC->HYB conversions. Sets A->transupdated on success. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) {
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* nothing changed since the last transpose generation */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n; /* dimensions swapped for the transpose */
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
#if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
         see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

         I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
         it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
         when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
      */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
#endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* build the CSR->CSC value permutation once by transposing 0,1,2,... stored as scalars */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* gather the (possibly updated) values of A through the cached permutation into the transpose */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) {
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  /* lazily build the transposed triangular factors the first time a transpose solve is requested */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* First, solve U */
  /* NOTE: the argument order of cusparseXcsrsv_solve differs across CUDA versions, hence the interleaved #if blocks */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Transpose solve for a factorization computed in natural ordering: no row/column
   permutation vectors are applied, only the two triangular solves */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) {
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                   stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  /* lazily build the transposed triangular factors the first time a transpose solve is requested */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U */
  /* NOTE: the argument order of cusparseXcsrsv_solve differs across CUDA versions, hence the interleaved #if blocks */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Forward solve A x = b using the cached lower/upper triangular factors: applies the
   row permutation into the work vector, then the L and U triangular solves */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) {
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct
*)cusparseTriFactors->loTriFactorPtr; 1323aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1324aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 13259ae82921SPaul Mullowney 13269ae82921SPaul Mullowney PetscFunctionBegin; 1327ebc8f436SDominic Meiser 1328e057df02SPaul Mullowney /* Get the GPU pointers */ 13299566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 13309566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1331c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1332c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 13339ae82921SPaul Mullowney 13349566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1335aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 13369371c9d4SSatish Balay thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin()); 1337aa372e3fSPaul Mullowney 1338aa372e3fSPaul Mullowney /* Next, solve L */ 13399371c9d4SSatish Balay stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, 13401b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1341afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1342afb2bd1cSJunchao Zhang #endif 13439371c9d4SSatish Balay &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 1344d49cd2b7SBarry Smith tempGPU->data().get(), 13451b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 13469371c9d4SSatish Balay xarray, loTriFactor->solvePolicy, 
loTriFactor->solveBuffer); 13479371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1348d49cd2b7SBarry Smith #else 13499371c9d4SSatish Balay xarray); 13509371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1351afb2bd1cSJunchao Zhang #endif 1352aa372e3fSPaul Mullowney 1353aa372e3fSPaul Mullowney /* Then, solve U */ 13549371c9d4SSatish Balay stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, 13551b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1356afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1357afb2bd1cSJunchao Zhang #endif 13589371c9d4SSatish Balay &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, 13591b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 13609371c9d4SSatish Balay tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer); 13619371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1362d49cd2b7SBarry Smith #else 13639371c9d4SSatish Balay tempGPU->data().get()); 13649371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1365afb2bd1cSJunchao Zhang #endif 1366d49cd2b7SBarry Smith 13674e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 13689371c9d4SSatish Balay thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU); 13699ae82921SPaul Mullowney 13709566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 13719566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 13729566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 13739566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 
PetscFunctionReturn(0);
}

/* Solve A x = b when the triangular factors are stored in natural (identity) ordering:
   L then U triangular solves via the work vector, with no vector permutations. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) {
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                   stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Next, solve U */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

#if CUSPARSE_VERSION >= 11500
/* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */

/* Solve A x = b using the in-place ILU(0) factors kept in fact->spptr, via the generic
   cusparseSpSV API: first L y = b (result in the internal work vector Y), then U x = y. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
                                       fs->spsvDescr_L)); // cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
/* Solve U*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}

/* Solve A^T x = b with the ILU(0) factors: U^T y = b then L^T x = y. The transpose
   SpSV descriptors/buffers are created on the first transpose solve, and the (numeric)
   SpSV analysis is redone whenever the factor values have changed since the last one. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  if (!fs->updatedTransposeSpSVAnalysis) {
    /* Analysis is numeric (reads factor values), so it must be redone after each refactorization */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve Ut*y = b */
PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}

/* Numeric ILU(0) factorization: copy A's values into fact's in-place CSR storage,
   run cusparseXcsrilu02, then (re)do the SpSV analysis for L and U (analysis is
   numeric, so it must follow the factorization). Installs the ILU0 solve callbacks. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}

/* Symbolic ILU(0) factorization: since ILU(0) keeps A's sparsity pattern, this copies
   A's structure to fact, creates the cuSPARSE matrix/vector descriptors, sizes and
   allocates the factorization/solve buffers, runs the structural csrilu02 analysis,
   and estimates the numeric-factorization flop count. isrow/iscol are unused here. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool
flg, missing; 1569da112707SJunchao Zhang 1570da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1571da112707SJunchao Zhang PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1572da112707SJunchao Zhang PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 1573da112707SJunchao Zhang PetscCall(MatMissingDiagonal(A, &missing, &i)); 1574da112707SJunchao Zhang PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1575da112707SJunchao Zhang } 1576da112707SJunchao Zhang 1577da112707SJunchao Zhang /* Free the old stale stuff */ 1578da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1579da112707SJunchao Zhang 1580da112707SJunchao Zhang /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1581da112707SJunchao Zhang but they will not be used. Allocate them just for easy debugging. 1582da112707SJunchao Zhang */ 1583da112707SJunchao Zhang PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1584da112707SJunchao Zhang 1585da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_BOTH; 1586da112707SJunchao Zhang fact->factortype = MAT_FACTOR_ILU; 1587da112707SJunchao Zhang fact->info.factor_mallocs = 0; 1588da112707SJunchao Zhang fact->info.fill_ratio_given = info->fill; 1589da112707SJunchao Zhang fact->info.fill_ratio_needed = 1.0; 1590da112707SJunchao Zhang 1591da112707SJunchao Zhang aij->row = NULL; 1592da112707SJunchao Zhang aij->col = NULL; 1593da112707SJunchao Zhang 1594da112707SJunchao Zhang /* ====================================================================== */ 1595da112707SJunchao Zhang /* Copy A's i, j to fact and also allocate the value array of fact. 
*/ 1596da112707SJunchao Zhang /* We'll do in-place factorization on fact */ 1597da112707SJunchao Zhang /* ====================================================================== */ 1598da112707SJunchao Zhang const int *Ai, *Aj; 1599da112707SJunchao Zhang 1600da112707SJunchao Zhang m = fact->rmap->n; 1601da112707SJunchao Zhang nz = aij->nz; 1602da112707SJunchao Zhang 1603da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1))); 1604da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz)); 1605da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz)); 1606da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */ 1607da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1608da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1609da112707SJunchao Zhang 1610da112707SJunchao Zhang /* ====================================================================== */ 1611da112707SJunchao Zhang /* Create descriptors for M, L, U */ 1612da112707SJunchao Zhang /* ====================================================================== */ 1613da112707SJunchao Zhang cusparseFillMode_t fillMode; 1614da112707SJunchao Zhang cusparseDiagType_t diagType; 1615da112707SJunchao Zhang 1616da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1617da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1618da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1619da112707SJunchao Zhang 1620da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1621da112707SJunchao Zhang 
cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1622da112707SJunchao Zhang assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1623da112707SJunchao Zhang all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1624da112707SJunchao Zhang assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 1625da112707SJunchao Zhang */ 1626da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_LOWER; 1627da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_UNIT; 16289371c9d4SSatish Balay PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 16299371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 16309371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1631da112707SJunchao Zhang 1632da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_UPPER; 1633da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 16349371c9d4SSatish Balay PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 16359371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 16369371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1637da112707SJunchao Zhang 1638da112707SJunchao Zhang /* ========================================================================= 
*/ 1639da112707SJunchao Zhang /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1640da112707SJunchao Zhang /* ========================================================================= */ 1641da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 16429371c9d4SSatish Balay if (m) 16439371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 16449371c9d4SSatish Balay fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M)); 1645da112707SJunchao Zhang 1646da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1647da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1648da112707SJunchao Zhang 1649da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1650da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 1651da112707SJunchao Zhang 1652da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 16539371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1654da112707SJunchao Zhang 1655da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 16569371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 1657da112707SJunchao Zhang 1658da112707SJunchao Zhang /* From my experiment with the example at 
https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 165912ba2bc6SJunchao Zhang and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, 166012ba2bc6SJunchao Zhang spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U. 166112ba2bc6SJunchao Zhang To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U. 1662da112707SJunchao Zhang */ 166312ba2bc6SJunchao Zhang if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 166412ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 166512ba2bc6SJunchao Zhang fs->spsvBuffer_L = fs->factBuffer_M; 1666da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 166712ba2bc6SJunchao Zhang } else { 166812ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M))); 166912ba2bc6SJunchao Zhang fs->spsvBuffer_U = fs->factBuffer_M; 1670da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 167112ba2bc6SJunchao Zhang } 1672da112707SJunchao Zhang 1673da112707SJunchao Zhang /* ========================================================================== */ 1674da112707SJunchao Zhang /* Perform analysis of ilu0 on M, SpSv on L and U */ 1675da112707SJunchao Zhang /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/ 1676da112707SJunchao Zhang /* ========================================================================== */ 1677da112707SJunchao Zhang int structural_zero; 1678da112707SJunchao Zhang cusparseStatus_t status; 1679da112707SJunchao Zhang 1680da112707SJunchao Zhang fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 16819371c9d4SSatish Balay if (m) 16829371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* 
cusparseXcsrilu02 errors out with empty matrices (m=0) */ 16839371c9d4SSatish Balay fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1684da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1685da112707SJunchao Zhang /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */ 1686da112707SJunchao Zhang status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1687da112707SJunchao Zhang PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero); 1688da112707SJunchao Zhang } 1689da112707SJunchao Zhang 1690da112707SJunchao Zhang /* Estimate FLOPs of the numeric factorization */ 16910dd8c0acSJunchao Zhang { 1692da112707SJunchao Zhang Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 16930dd8c0acSJunchao Zhang PetscInt *Ai, *Adiag, nzRow, nzLeft; 1694da112707SJunchao Zhang PetscLogDouble flops = 0.0; 1695da112707SJunchao Zhang 1696da112707SJunchao Zhang PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1697da112707SJunchao Zhang Ai = Aseq->i; 1698da112707SJunchao Zhang Adiag = Aseq->diag; 1699da112707SJunchao Zhang for (PetscInt i = 0; i < m; i++) { 1700da112707SJunchao Zhang if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */ 1701da112707SJunchao Zhang nzRow = Ai[i + 1] - Ai[i]; 1702da112707SJunchao Zhang nzLeft = Adiag[i] - Ai[i]; 1703da112707SJunchao Zhang /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1704da112707SJunchao Zhang and include the eliminated one will be updated, which incurs a multiplication and an addition. 
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops; /* saved so the numeric phase can log an accurate flop count */
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(0);
}

/* Triangular solves for an ICC(0) factorization held on the GPU: solve L*y = b, then Lt*x = y.
   Only L is stored (fs->spMatDescr_L); the second solve reuses the same descriptor with
   CUSPARSE_OPERATION_TRANSPOSE. Since L*Lt is symmetric, this routine also serves as solvetranspose. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); /* two triangular solves: 2*nz - n flops */
  PetscFunctionReturn(0);
}

/* Numeric ICC(0): copy A's values into fact, run csric02 in place on the GPU, then redo the
   SpSV analysis for the L and Lt solves against the new numeric values. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
  */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* same routine: the factorization is symmetric */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}

/* Symbolic ICC(0): set fact up for in-place factorization on the GPU (level 0, natural ordering) */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n,
               PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ICC(0) introduces no fill beyond A's sparsity pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
     The factorization buffer aliases the LARGER of the two SpSV buffers; the other is allocated separately.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops; /* logged later by MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0() */
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(0);
}
#endif

/* Symbolic ILU: use the device ILU0 fast path for level 0 with natural ordering (and a new-enough
   cuSPARSE), otherwise fall back to the CPU symbolic phase followed by GPU numeric factorization. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) { 1963da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 1964da112707SJunchao Zhang 1965da112707SJunchao Zhang PetscFunctionBegin; 1966da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 1967da112707SJunchao Zhang PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1968da112707SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 1969da112707SJunchao Zhang PetscFunctionReturn(0); 1970da112707SJunchao Zhang } 1971da112707SJunchao Zhang 19729371c9d4SSatish Balay static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) { 1973da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 1974da112707SJunchao Zhang 1975da112707SJunchao Zhang PetscFunctionBegin; 1976da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500 1977bc996fdcSJunchao Zhang PetscBool perm_identity = PETSC_FALSE; 1978bc996fdcSJunchao Zhang if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity)); 1979da112707SJunchao Zhang if (!info->levels && perm_identity) { 1980da112707SJunchao Zhang PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info)); 1981da112707SJunchao Zhang } else 1982da112707SJunchao Zhang #endif 1983da112707SJunchao Zhang { 1984da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 1985da112707SJunchao Zhang PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 1986da112707SJunchao Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 1987da112707SJunchao Zhang } 1988da112707SJunchao Zhang PetscFunctionReturn(0); 1989da112707SJunchao Zhang } 1990da112707SJunchao Zhang 19919371c9d4SSatish Balay static PetscErrorCode 
MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Full Cholesky always uses the CPU symbolic phase; only the numeric phase is dispatched to the GPU */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type) {
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* Create a cusparse factor matrix B of the requested factor type for A, wiring up the symbolic
   factorization ops (GPU paths unless A is bound to the CPU) and the preferred orderings. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B) {
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* Let the user pick where the factorization runs: -mat_factor_bind_factorization host|device */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}

/* Copy matrix values from device to host when the device copy is ahead.
   FIX: the factored-matrix branch was guarded by "#if CUSPARSE_VERSION >= 13500" while the device
   ILU0/ICC0 paths that populate fs->csrVal are enabled at "#if CUSPARSE_VERSION >= 11500" (see
   MatILUFactorSymbolic_SeqAIJCUSPARSE / MatICCFactorSymbolic_SeqAIJCUSPARSE). With the mismatched
   guard, 11500 <= CUSPARSE_VERSION < 13500 would error with "No support ..." on a matrix this very
   library factored on the device. Both guards now use 11500 consistently. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) {
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 11500
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 11500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device now agree */
  }
  PetscFunctionReturn(0);
}

/* Read/write access to the host value array: sync from device first */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* host values may have been changed; device copy is stale */
  *array         = NULL;
  PetscFunctionReturn(0);
}

/* Read-only access: sync from device, but restore will not invalidate the device copy */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) {
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) {
  PetscFunctionBegin;
  /* Read-only access: offloadmask is left alone, both copies remain valid */
  *array = NULL;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  /* Write-only access: current device values need not be copied back first */
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* host values were (re)written; device copy is stale */
  *array         = NULL;
  PetscFunctionReturn(0);
}

/* Return device pointers to the CSR arrays (and the memory type) of a non-factored matrix.
   The cuSPARSE storage uses 32-bit row/column indices, hence the 64-bit-indices error paths. */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) {
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}

/* Copy the matrix from host to GPU: values only when the nonzero state is unchanged (CSR format),
   otherwise the cuSPARSE structures are destroyed and rebuilt. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) {
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;
218585ba7357SStefano Zampini 218608401ef6SPierre Jolivet PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values"); 21879566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2188afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a + a->nz); 21899566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 21909566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar))); 21919566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 21929566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 219334d6c7a5SJose E. Roman } else { 2194abb89eb1SStefano Zampini PetscInt nnz; 21959566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 21969566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format)); 21979566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 21987c700b8dSJunchao Zhang delete cusparsestruct->workVector; 219981902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 2200a49f1ed0SStefano Zampini cusparsestruct->workVector = NULL; 2201a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = NULL; 22029ae82921SPaul Mullowney try { 22039ae82921SPaul Mullowney if (a->compressedrow.use) { 22049ae82921SPaul Mullowney m = a->compressedrow.nrows; 22059ae82921SPaul Mullowney ii = a->compressedrow.i; 22069ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 22079ae82921SPaul Mullowney } else { 2208213423ffSJunchao Zhang m = A->rmap->n; 2209213423ffSJunchao Zhang ii = a->i; 2210e6e9a74fSStefano Zampini ridx = NULL; 22119ae82921SPaul Mullowney } 221208401ef6SPierre Jolivet PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data"); 22139371c9d4SSatish Balay if (!a->a) { 22149371c9d4SSatish Balay nnz = ii[m]; 22159371c9d4SSatish Balay both = 
PETSC_FALSE; 22169371c9d4SSatish Balay } else nnz = a->nz; 221708401ef6SPierre Jolivet PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data"); 22189ae82921SPaul Mullowney 221985ba7357SStefano Zampini /* create cusparse matrix */ 2220abb89eb1SStefano Zampini cusparsestruct->nrows = m; 2221aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 22229566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 22239566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 22249566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 22259ae82921SPaul Mullowney 22269566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar))); 22279566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar))); 22289566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar))); 22299566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 22309566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 22319566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 22329566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2233b06137fdSPaul Mullowney 2234aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 2235aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2236aa372e3fSPaul Mullowney /* set the matrix */ 2237afb2bd1cSJunchao Zhang CsrMatrix *mat = new CsrMatrix; 
2238afb2bd1cSJunchao Zhang mat->num_rows = m; 2239afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 2240abb89eb1SStefano Zampini mat->num_entries = nnz; 2241afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m + 1); 2242afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m + 1); 22439ae82921SPaul Mullowney 2244abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 2245abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j + nnz); 2246aa372e3fSPaul Mullowney 2247abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 2248abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a + nnz); 2249aa372e3fSPaul Mullowney 2250aa372e3fSPaul Mullowney /* assign the pointer */ 2251afb2bd1cSJunchao Zhang matstruct->mat = mat; 2252afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2253afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! */ 22549371c9d4SSatish Balay stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 22559371c9d4SSatish Balay CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 22569371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2257afb2bd1cSJunchao Zhang } 2258afb2bd1cSJunchao Zhang #endif 2259aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 2260afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2261afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2262afb2bd1cSJunchao Zhang #else 2263afb2bd1cSJunchao Zhang CsrMatrix *mat = new CsrMatrix; 2264afb2bd1cSJunchao Zhang mat->num_rows = m; 2265afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 2266abb89eb1SStefano Zampini 
mat->num_entries = nnz; 2267afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m + 1); 2268afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m + 1); 2269aa372e3fSPaul Mullowney 2270abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 2271abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j + nnz); 2272aa372e3fSPaul Mullowney 2273abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 2274abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a + nnz); 2275aa372e3fSPaul Mullowney 2276aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 22779566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 22789371c9d4SSatish Balay cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 22799371c9d4SSatish Balay stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition); 22809371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2281aa372e3fSPaul Mullowney /* assign the pointer */ 2282aa372e3fSPaul Mullowney matstruct->mat = hybMat; 2283aa372e3fSPaul Mullowney 2284afb2bd1cSJunchao Zhang if (mat) { 2285afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY *)mat->values; 2286afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices; 2287afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets; 2288afb2bd1cSJunchao Zhang delete (CsrMatrix *)mat; 2289087f3262SPaul Mullowney } 2290afb2bd1cSJunchao Zhang #endif 2291087f3262SPaul Mullowney } 2292ca45077fSPaul Mullowney 2293aa372e3fSPaul Mullowney /* assign the compressed row indices */ 2294213423ffSJunchao Zhang if (a->compressedrow.use) { 2295213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 2296aa372e3fSPaul Mullowney 
matstruct->cprowIndices = new THRUSTINTARRAY(m); 2297aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx, ridx + m); 2298213423ffSJunchao Zhang tmp = m; 2299213423ffSJunchao Zhang } else { 2300213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 2301213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 2302213423ffSJunchao Zhang tmp = 0; 2303213423ffSJunchao Zhang } 23049566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar))); 2305aa372e3fSPaul Mullowney 2306aa372e3fSPaul Mullowney /* assign the pointer */ 2307aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 23089371c9d4SSatish Balay } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); } 23099566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 23109566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 231134d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 231234d6c7a5SJose E. 
Roman } 2313abb89eb1SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 23149ae82921SPaul Mullowney } 23159ae82921SPaul Mullowney PetscFunctionReturn(0); 23169ae82921SPaul Mullowney } 23179ae82921SPaul Mullowney 23189371c9d4SSatish Balay struct VecCUDAPlusEquals { 2319aa372e3fSPaul Mullowney template <typename Tuple> 23209371c9d4SSatish Balay __host__ __device__ void operator()(Tuple t) { 2321aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2322aa372e3fSPaul Mullowney } 2323aa372e3fSPaul Mullowney }; 2324aa372e3fSPaul Mullowney 23259371c9d4SSatish Balay struct VecCUDAEquals { 23267e8381f9SStefano Zampini template <typename Tuple> 23279371c9d4SSatish Balay __host__ __device__ void operator()(Tuple t) { 23287e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 23297e8381f9SStefano Zampini } 23307e8381f9SStefano Zampini }; 23317e8381f9SStefano Zampini 23329371c9d4SSatish Balay struct VecCUDAEqualsReverse { 2333e6e9a74fSStefano Zampini template <typename Tuple> 23349371c9d4SSatish Balay __host__ __device__ void operator()(Tuple t) { 2335e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 2336e6e9a74fSStefano Zampini } 2337e6e9a74fSStefano Zampini }; 2338e6e9a74fSStefano Zampini 2339afb2bd1cSJunchao Zhang struct MatMatCusparse { 2340ccdfe979SStefano Zampini PetscBool cisdense; 2341ccdfe979SStefano Zampini PetscScalar *Bt; 2342ccdfe979SStefano Zampini Mat X; 2343fcdce8c4SStefano Zampini PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2344fcdce8c4SStefano Zampini PetscLogDouble flops; 2345fcdce8c4SStefano Zampini CsrMatrix *Bcsr; 2346b4285af6SJunchao Zhang 2347afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2348fcdce8c4SStefano Zampini cusparseSpMatDescr_t matSpBDescr; 2349afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2350afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 
2351afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; 2352afb2bd1cSJunchao Zhang PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/ 2353b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2354b4285af6SJunchao Zhang void *dBuffer4; 2355b4285af6SJunchao Zhang void *dBuffer5; 2356b4285af6SJunchao Zhang #endif 2357fcdce8c4SStefano Zampini size_t mmBufferSize; 2358fcdce8c4SStefano Zampini void *mmBuffer; 2359fcdce8c4SStefano Zampini void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2360fcdce8c4SStefano Zampini cusparseSpGEMMDescr_t spgemmDesc; 2361afb2bd1cSJunchao Zhang #endif 2362afb2bd1cSJunchao Zhang }; 2363ccdfe979SStefano Zampini 23649371c9d4SSatish Balay static PetscErrorCode MatDestroy_MatMatCusparse(void *data) { 2365ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse *)data; 2366ccdfe979SStefano Zampini 2367ccdfe979SStefano Zampini PetscFunctionBegin; 23689566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mmdata->Bt)); 2369fcdce8c4SStefano Zampini delete mmdata->Bcsr; 2370afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 23719566063dSJacob Faibussowitsch if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 23729566063dSJacob Faibussowitsch if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 23739566063dSJacob Faibussowitsch if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 23749566063dSJacob Faibussowitsch if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2375b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 23769566063dSJacob Faibussowitsch if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 23779566063dSJacob Faibussowitsch if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2378b4285af6SJunchao Zhang #endif 23799566063dSJacob Faibussowitsch if (mmdata->mmBuffer) 
PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 23809566063dSJacob Faibussowitsch if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2381afb2bd1cSJunchao Zhang #endif 23829566063dSJacob Faibussowitsch PetscCall(MatDestroy(&mmdata->X)); 23839566063dSJacob Faibussowitsch PetscCall(PetscFree(data)); 2384ccdfe979SStefano Zampini PetscFunctionReturn(0); 2385ccdfe979SStefano Zampini } 2386ccdfe979SStefano Zampini 2387ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool); 2388ccdfe979SStefano Zampini 23899371c9d4SSatish Balay static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) { 2390ccdfe979SStefano Zampini Mat_Product *product = C->product; 2391ccdfe979SStefano Zampini Mat A, B; 2392afb2bd1cSJunchao Zhang PetscInt m, n, blda, clda; 2393ccdfe979SStefano Zampini PetscBool flg, biscuda; 2394ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2395ccdfe979SStefano Zampini cusparseStatus_t stat; 2396ccdfe979SStefano Zampini cusparseOperation_t opA; 2397ccdfe979SStefano Zampini const PetscScalar *barray; 2398ccdfe979SStefano Zampini PetscScalar *carray; 2399ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2400ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 2401ccdfe979SStefano Zampini CsrMatrix *csrmat; 2402ccdfe979SStefano Zampini 2403ccdfe979SStefano Zampini PetscFunctionBegin; 2404ccdfe979SStefano Zampini MatCheckProduct(C, 1); 240528b400f6SJacob Faibussowitsch PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 2406ccdfe979SStefano Zampini mmdata = (MatMatCusparse *)product->data; 2407ccdfe979SStefano Zampini A = product->A; 2408ccdfe979SStefano Zampini B = product->B; 24099566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 241028b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", 
((PetscObject)A)->type_name); 2411ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 2412ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 241328b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 24149566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2415ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2416ccdfe979SStefano Zampini switch (product->type) { 2417ccdfe979SStefano Zampini case MATPRODUCT_AB: 2418ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2419ccdfe979SStefano Zampini mat = cusp->mat; 2420ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2421ccdfe979SStefano Zampini m = A->rmap->n; 2422ccdfe979SStefano Zampini n = B->cmap->n; 2423ccdfe979SStefano Zampini break; 2424ccdfe979SStefano Zampini case MATPRODUCT_AtB: 24251a2c6b5cSJunchao Zhang if (!A->form_explicit_transpose) { 2426e6e9a74fSStefano Zampini mat = cusp->mat; 2427e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 2428e6e9a74fSStefano Zampini } else { 24299566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2430ccdfe979SStefano Zampini mat = cusp->matTranspose; 2431ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2432e6e9a74fSStefano Zampini } 2433ccdfe979SStefano Zampini m = A->cmap->n; 2434ccdfe979SStefano Zampini n = B->cmap->n; 2435ccdfe979SStefano Zampini break; 2436ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2437ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2438ccdfe979SStefano Zampini mat = cusp->mat; 2439ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2440ccdfe979SStefano Zampini m = A->rmap->n; 2441ccdfe979SStefano Zampini n = B->rmap->n; 2442ccdfe979SStefano Zampini break; 24439371c9d4SSatish Balay default: 
SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2444ccdfe979SStefano Zampini } 244528b400f6SJacob Faibussowitsch PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 2446ccdfe979SStefano Zampini csrmat = (CsrMatrix *)mat->mat; 2447ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 24489566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda)); 24499566063dSJacob Faibussowitsch if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B)); 24509566063dSJacob Faibussowitsch PetscCall(MatDenseCUDAGetArrayRead(B, &barray)); 2451afb2bd1cSJunchao Zhang 24529566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(B, &blda)); 2453c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 24549566063dSJacob Faibussowitsch PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X, &carray)); 24559566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(mmdata->X, &clda)); 2456c8378d12SStefano Zampini } else { 24579566063dSJacob Faibussowitsch PetscCall(MatDenseCUDAGetArrayWrite(C, &carray)); 24589566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(C, &clda)); 2459c8378d12SStefano Zampini } 2460c8378d12SStefano Zampini 24619566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2462afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2463afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2464a5b23f4aSJose E. 
Roman /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2465afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2466fcdce8c4SStefano Zampini size_t mmBufferSize; 24679371c9d4SSatish Balay if (mmdata->initialized && mmdata->Blda != blda) { 24689371c9d4SSatish Balay PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 24699371c9d4SSatish Balay mmdata->matBDescr = NULL; 24709371c9d4SSatish Balay } 2471afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 24729566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2473afb2bd1cSJunchao Zhang mmdata->Blda = blda; 2474afb2bd1cSJunchao Zhang } 2475c8378d12SStefano Zampini 24769371c9d4SSatish Balay if (mmdata->initialized && mmdata->Clda != clda) { 24779371c9d4SSatish Balay PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 24789371c9d4SSatish Balay mmdata->matCDescr = NULL; 24799371c9d4SSatish Balay } 2480afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 24819566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2482afb2bd1cSJunchao Zhang mmdata->Clda = clda; 2483afb2bd1cSJunchao Zhang } 2484afb2bd1cSJunchao Zhang 2485afb2bd1cSJunchao Zhang if (!mat->matDescr) { 24869371c9d4SSatish Balay stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 24879371c9d4SSatish Balay CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 24889371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2489afb2bd1cSJunchao Zhang } 24909371c9d4SSatish Balay stat = 
cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize); 24919371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2492fcdce8c4SStefano Zampini if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 24939566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 24949566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize)); 2495fcdce8c4SStefano Zampini mmdata->mmBufferSize = mmBufferSize; 2496fcdce8c4SStefano Zampini } 2497afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 2498afb2bd1cSJunchao Zhang } else { 2499afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */ 25009566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get())); 25019566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray)); 25029566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray)); 2503afb2bd1cSJunchao Zhang } 2504afb2bd1cSJunchao Zhang 2505afb2bd1cSJunchao Zhang /* do cusparseSpMM, which supports transpose on B */ 25069371c9d4SSatish Balay stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer); 25079371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2508afb2bd1cSJunchao Zhang #else 2509afb2bd1cSJunchao Zhang PetscInt k; 2510afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ 2511ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2512ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 2513ccdfe979SStefano Zampini cublasStatus_t cerr; 2514ccdfe979SStefano Zampini 25159566063dSJacob Faibussowitsch 
PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 25169371c9d4SSatish Balay cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n); 25179371c9d4SSatish Balay PetscCallCUBLAS(cerr); 2518ccdfe979SStefano Zampini blda = B->cmap->n; 2519afb2bd1cSJunchao Zhang k = B->cmap->n; 2520afb2bd1cSJunchao Zhang } else { 2521afb2bd1cSJunchao Zhang k = B->rmap->n; 2522ccdfe979SStefano Zampini } 2523ccdfe979SStefano Zampini 2524afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 25259371c9d4SSatish Balay stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda); 25269371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2527afb2bd1cSJunchao Zhang #endif 25289566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 25299566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries)); 25309566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayRead(B, &barray)); 2531ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 25329566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray)); 25339566063dSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE)); 2534ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 25359566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray)); 25369566063dSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE)); 2537ccdfe979SStefano Zampini } else { 25389566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(C, 
&carray)); 2539ccdfe979SStefano Zampini } 254048a46eb9SPierre Jolivet if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C)); 254148a46eb9SPierre Jolivet if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B)); 2542ccdfe979SStefano Zampini PetscFunctionReturn(0); 2543ccdfe979SStefano Zampini } 2544ccdfe979SStefano Zampini 25459371c9d4SSatish Balay static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) { 2546ccdfe979SStefano Zampini Mat_Product *product = C->product; 2547ccdfe979SStefano Zampini Mat A, B; 2548ccdfe979SStefano Zampini PetscInt m, n; 2549ccdfe979SStefano Zampini PetscBool cisdense, flg; 2550ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2551ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2552ccdfe979SStefano Zampini 2553ccdfe979SStefano Zampini PetscFunctionBegin; 2554ccdfe979SStefano Zampini MatCheckProduct(C, 1); 255528b400f6SJacob Faibussowitsch PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 2556ccdfe979SStefano Zampini A = product->A; 2557ccdfe979SStefano Zampini B = product->B; 25589566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 255928b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2560ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 256108401ef6SPierre Jolivet PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2562ccdfe979SStefano Zampini switch (product->type) { 2563ccdfe979SStefano Zampini case MATPRODUCT_AB: 2564ccdfe979SStefano Zampini m = A->rmap->n; 2565ccdfe979SStefano Zampini n = B->cmap->n; 2566ccdfe979SStefano Zampini break; 2567ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2568ccdfe979SStefano Zampini m = A->cmap->n; 2569ccdfe979SStefano Zampini n = B->cmap->n; 
2570ccdfe979SStefano Zampini break; 2571ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2572ccdfe979SStefano Zampini m = A->rmap->n; 2573ccdfe979SStefano Zampini n = B->rmap->n; 2574ccdfe979SStefano Zampini break; 2575ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2576ccdfe979SStefano Zampini m = B->cmap->n; 2577ccdfe979SStefano Zampini n = B->cmap->n; 2578ccdfe979SStefano Zampini break; 2579ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2580ccdfe979SStefano Zampini m = B->rmap->n; 2581ccdfe979SStefano Zampini n = B->rmap->n; 2582ccdfe979SStefano Zampini break; 25839371c9d4SSatish Balay default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2584ccdfe979SStefano Zampini } 25859566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, m, n, m, n)); 2586ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 25879566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense)); 25889566063dSJacob Faibussowitsch PetscCall(MatSetType(C, MATSEQDENSECUDA)); 2589ccdfe979SStefano Zampini 2590ccdfe979SStefano Zampini /* product data */ 25919566063dSJacob Faibussowitsch PetscCall(PetscNew(&mmdata)); 2592ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 2593afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0) 2594afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 259548a46eb9SPierre Jolivet if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar))); 2596afb2bd1cSJunchao Zhang #endif 2597ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 2598ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 25999566063dSJacob Faibussowitsch 
    /* intermediate storage used by the RARt/PtAP paths; sized below per product type */
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/*
  Numeric phase of the sparse-sparse product C = op(A)*op(B) for MATSEQAIJCUSPARSE
  operands. The symbolic phase (MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE)
  must have run first: it allocates C's CSR storage and, on CUDA >= 11, the
  cusparseSpGEMM(reuse) descriptor/buffers that are consumed here.
  Collective; returns a PETSc error code.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) {
  Mat_Product                 *product = C->product;
  Mat                          A, B;
  Mat_SeqAIJCUSPARSE          *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                  *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                   *Acsr, *Bcsr, *Ccsr;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse              *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize; /* values already on the GPU; only the CPU-side bookkeeping below remains */
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* if symbolic exploited symmetry of A (AtB) or B (ABt), the numeric phase runs the plain AB kernel */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* pick the (possibly explicitly transposed) mult structs, since cuSPARSE spgemm only does non-transpose ops */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); /* alpha/beta live on the device (Cmat->alpha_one etc.) */
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* descriptors/buffers were prepared by cusparseSpGEMMreuse_* in the symbolic phase */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif
#else
  /* pre-11.0 legacy csrgemm path: computes values directly into C's CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flop count was precomputed in the symbolic phase */
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) {
  Mat_Product *product = C->product;
  Mat          A, B;
2736fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp; 2737fcdce8c4SStefano Zampini Mat_SeqAIJ *a, *b, *c; 2738fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat; 2739fcdce8c4SStefano Zampini CsrMatrix *Acsr, *Bcsr, *Ccsr; 2740fcdce8c4SStefano Zampini PetscInt i, j, m, n, k; 2741fcdce8c4SStefano Zampini PetscBool flg; 2742fcdce8c4SStefano Zampini cusparseStatus_t stat; 2743fcdce8c4SStefano Zampini MatProductType ptype; 2744fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2745fcdce8c4SStefano Zampini PetscLogDouble flops; 2746fcdce8c4SStefano Zampini PetscBool biscompressed, ciscompressed; 2747fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2748fcdce8c4SStefano Zampini int64_t C_num_rows1, C_num_cols1, C_nnz1; 2749fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2750fcdce8c4SStefano Zampini #else 2751fcdce8c4SStefano Zampini int cnz; 2752fcdce8c4SStefano Zampini #endif 2753b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2754fcdce8c4SStefano Zampini 2755fcdce8c4SStefano Zampini PetscFunctionBegin; 2756fcdce8c4SStefano Zampini MatCheckProduct(C, 1); 275728b400f6SJacob Faibussowitsch PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 2758fcdce8c4SStefano Zampini A = product->A; 2759fcdce8c4SStefano Zampini B = product->B; 27609566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 276128b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 27629566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 276328b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", 
((PetscObject)B)->type_name); 2764fcdce8c4SStefano Zampini a = (Mat_SeqAIJ *)A->data; 2765fcdce8c4SStefano Zampini b = (Mat_SeqAIJ *)B->data; 2766fcdce8c4SStefano Zampini /* product data */ 27679566063dSJacob Faibussowitsch PetscCall(PetscNew(&mmdata)); 2768fcdce8c4SStefano Zampini C->product->data = mmdata; 2769fcdce8c4SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2770fcdce8c4SStefano Zampini 27719566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 27729566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2773d60bce21SJunchao Zhang Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2774d60bce21SJunchao Zhang Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 277508401ef6SPierre Jolivet PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 277608401ef6SPierre Jolivet PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2777d60bce21SJunchao Zhang 2778fcdce8c4SStefano Zampini ptype = product->type; 2779b94d7dedSBarry Smith if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 2780fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2781fa046f9fSJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2782fa046f9fSJunchao Zhang } 2783b94d7dedSBarry Smith if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 2784fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2785fa046f9fSJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2786fa046f9fSJunchao Zhang } 2787fcdce8c4SStefano Zampini biscompressed = PETSC_FALSE; 2788fcdce8c4SStefano Zampini ciscompressed = PETSC_FALSE; 2789fcdce8c4SStefano Zampini switch (ptype) { 2790fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2791fcdce8c4SStefano Zampini m = A->rmap->n; 2792fcdce8c4SStefano Zampini n = B->cmap->n; 
2793fcdce8c4SStefano Zampini k = A->cmap->n; 2794fcdce8c4SStefano Zampini Amat = Acusp->mat; 2795fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2796fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2797fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2798fcdce8c4SStefano Zampini break; 2799fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2800fcdce8c4SStefano Zampini m = A->cmap->n; 2801fcdce8c4SStefano Zampini n = B->cmap->n; 2802fcdce8c4SStefano Zampini k = A->rmap->n; 28039566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2804fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2805fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2806fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2807fcdce8c4SStefano Zampini break; 2808fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2809fcdce8c4SStefano Zampini m = A->rmap->n; 2810fcdce8c4SStefano Zampini n = B->rmap->n; 2811fcdce8c4SStefano Zampini k = A->cmap->n; 28129566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 2813fcdce8c4SStefano Zampini Amat = Acusp->mat; 2814fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2815fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2816fcdce8c4SStefano Zampini break; 28179371c9d4SSatish Balay default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2818fcdce8c4SStefano Zampini } 2819fcdce8c4SStefano Zampini 2820fcdce8c4SStefano Zampini /* create cusparse matrix */ 28219566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, m, n, m, n)); 28229566063dSJacob Faibussowitsch PetscCall(MatSetType(C, MATSEQAIJCUSPARSE)); 2823fcdce8c4SStefano Zampini c = (Mat_SeqAIJ *)C->data; 2824fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 2825fcdce8c4SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2826fcdce8c4SStefano Zampini Ccsr = new 
CsrMatrix; 2827fcdce8c4SStefano Zampini 2828fcdce8c4SStefano Zampini c->compressedrow.use = ciscompressed; 2829fcdce8c4SStefano Zampini if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2830fcdce8c4SStefano Zampini c->compressedrow.nrows = a->compressedrow.nrows; 28319566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex)); 28329566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows)); 2833fcdce8c4SStefano Zampini Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2834fcdce8c4SStefano Zampini Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2835fcdce8c4SStefano Zampini Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows); 2836fcdce8c4SStefano Zampini } else { 2837fcdce8c4SStefano Zampini c->compressedrow.nrows = 0; 2838fcdce8c4SStefano Zampini c->compressedrow.i = NULL; 2839fcdce8c4SStefano Zampini c->compressedrow.rindex = NULL; 2840fcdce8c4SStefano Zampini Ccusp->workVector = NULL; 2841fcdce8c4SStefano Zampini Cmat->cprowIndices = NULL; 2842fcdce8c4SStefano Zampini } 2843fcdce8c4SStefano Zampini Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2844fcdce8c4SStefano Zampini Ccusp->mat = Cmat; 2845fcdce8c4SStefano Zampini Ccusp->mat->mat = Ccsr; 2846fcdce8c4SStefano Zampini Ccsr->num_rows = Ccusp->nrows; 2847fcdce8c4SStefano Zampini Ccsr->num_cols = n; 2848fcdce8c4SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 28499566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 28509566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 28519566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 28529566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 28539566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 28549566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 28559566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 28569566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 28579566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2858fcdce8c4SStefano Zampini if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2859fcdce8c4SStefano Zampini thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0); 2860fcdce8c4SStefano Zampini c->nz = 0; 2861fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2862fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2863fcdce8c4SStefano Zampini goto finalizesym; 2864fcdce8c4SStefano Zampini } 2865fcdce8c4SStefano Zampini 286628b400f6SJacob Faibussowitsch PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 286728b400f6SJacob Faibussowitsch PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 2868fcdce8c4SStefano Zampini Acsr = (CsrMatrix *)Amat->mat; 2869fcdce8c4SStefano Zampini if (!biscompressed) { 2870fcdce8c4SStefano Zampini Bcsr = (CsrMatrix *)Bmat->mat; 2871fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2872fcdce8c4SStefano Zampini BmatSpDescr = Bmat->matDescr; 2873fcdce8c4SStefano Zampini #endif 2874fcdce8c4SStefano Zampini } else { /* we need to use row offsets for the full matrix */ 2875fcdce8c4SStefano Zampini CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 2876fcdce8c4SStefano Zampini Bcsr = new CsrMatrix; 2877fcdce8c4SStefano Zampini Bcsr->num_rows = B->rmap->n; 2878fcdce8c4SStefano Zampini Bcsr->num_cols = cBcsr->num_cols; 2879fcdce8c4SStefano Zampini Bcsr->num_entries = cBcsr->num_entries; 2880fcdce8c4SStefano Zampini Bcsr->column_indices = cBcsr->column_indices; 2881fcdce8c4SStefano Zampini Bcsr->values = cBcsr->values; 2882fcdce8c4SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 2883fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2884fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 28859566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 2886fcdce8c4SStefano Zampini } 
2887fcdce8c4SStefano Zampini Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2888fcdce8c4SStefano Zampini mmdata->Bcsr = Bcsr; 2889fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2890fcdce8c4SStefano Zampini if (Bcsr->num_rows && Bcsr->num_cols) { 28919371c9d4SSatish Balay stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 28929371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2893fcdce8c4SStefano Zampini } 2894fcdce8c4SStefano Zampini BmatSpDescr = mmdata->matSpBDescr; 2895fcdce8c4SStefano Zampini #endif 2896fcdce8c4SStefano Zampini } 289728b400f6SJacob Faibussowitsch PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 289828b400f6SJacob Faibussowitsch PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 2899fcdce8c4SStefano Zampini /* precompute flops count */ 2900fcdce8c4SStefano Zampini if (ptype == MATPRODUCT_AB) { 2901fcdce8c4SStefano Zampini for (i = 0, flops = 0; i < A->rmap->n; i++) { 2902fcdce8c4SStefano Zampini const PetscInt st = a->i[i]; 2903fcdce8c4SStefano Zampini const PetscInt en = a->i[i + 1]; 2904fcdce8c4SStefano Zampini for (j = st; j < en; j++) { 2905fcdce8c4SStefano Zampini const PetscInt brow = a->j[j]; 2906fcdce8c4SStefano Zampini flops += 2. * (b->i[brow + 1] - b->i[brow]); 2907fcdce8c4SStefano Zampini } 2908fcdce8c4SStefano Zampini } 2909fcdce8c4SStefano Zampini } else if (ptype == MATPRODUCT_AtB) { 2910fcdce8c4SStefano Zampini for (i = 0, flops = 0; i < A->rmap->n; i++) { 2911fcdce8c4SStefano Zampini const PetscInt anzi = a->i[i + 1] - a->i[i]; 2912fcdce8c4SStefano Zampini const PetscInt bnzi = b->i[i + 1] - b->i[i]; 2913fcdce8c4SStefano Zampini flops += (2. 
* anzi) * bnzi; 2914fcdce8c4SStefano Zampini } 2915fcdce8c4SStefano Zampini } else { /* TODO */ 2916fcdce8c4SStefano Zampini flops = 0.; 2917fcdce8c4SStefano Zampini } 2918fcdce8c4SStefano Zampini 2919fcdce8c4SStefano Zampini mmdata->flops = flops; 29209566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2921b4285af6SJunchao Zhang 2922fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 29239566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 29249371c9d4SSatish Balay stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 29259371c9d4SSatish Balay PetscCallCUSPARSE(stat); 29269566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 2927b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2928b4285af6SJunchao Zhang { 2929b4285af6SJunchao Zhang /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2930b4285af6SJunchao Zhang We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2931b4285af6SJunchao Zhang */ 2932b4285af6SJunchao Zhang void *dBuffer1 = NULL; 2933b4285af6SJunchao Zhang void *dBuffer2 = NULL; 2934b4285af6SJunchao Zhang void *dBuffer3 = NULL; 2935b4285af6SJunchao Zhang /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2936b4285af6SJunchao Zhang size_t bufferSize1 = 0; 2937b4285af6SJunchao Zhang size_t bufferSize2 = 0; 2938b4285af6SJunchao Zhang size_t bufferSize3 = 0; 2939b4285af6SJunchao Zhang size_t bufferSize4 = 0; 2940b4285af6SJunchao Zhang size_t bufferSize5 = 0; 2941b4285af6SJunchao Zhang 2942b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2943b4285af6SJunchao Zhang /* ask bufferSize1 bytes for external memory */ 29449371c9d4SSatish Balay stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 29459371c9d4SSatish Balay PetscCallCUSPARSE(stat); 29469566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 2947b4285af6SJunchao Zhang /* inspect the matrices A and B to understand the memory requirement for the next step */ 29489371c9d4SSatish Balay stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 29499371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2950b4285af6SJunchao Zhang 2951b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 29529371c9d4SSatish Balay stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 
29539371c9d4SSatish Balay PetscCallCUSPARSE(stat); 29549566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 29559566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 29569566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 29579371c9d4SSatish Balay stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 29589371c9d4SSatish Balay PetscCallCUSPARSE(stat); 29599566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer1)); 29609566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer2)); 2961b4285af6SJunchao Zhang 2962b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2963b4285af6SJunchao Zhang /* get matrix C non-zero entries C_nnz1 */ 29649566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2965b4285af6SJunchao Zhang c->nz = (PetscInt)C_nnz1; 2966b4285af6SJunchao Zhang /* allocate matrix C */ 29679371c9d4SSatish Balay Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 29689371c9d4SSatish Balay PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 29699371c9d4SSatish Balay Ccsr->values = new THRUSTARRAY(c->nz); 29709371c9d4SSatish Balay PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2971b4285af6SJunchao Zhang /* update matC with the new pointers */ 29729371c9d4SSatish Balay stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 29739371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2974b4285af6SJunchao Zhang 2975b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 29769371c9d4SSatish Balay stat 
= cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 29779371c9d4SSatish Balay PetscCallCUSPARSE(stat); 29789566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 29799371c9d4SSatish Balay stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 29809371c9d4SSatish Balay PetscCallCUSPARSE(stat); 29819566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer3)); 29829371c9d4SSatish Balay stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 29839371c9d4SSatish Balay PetscCallCUSPARSE(stat); 29849566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024)); 2985b4285af6SJunchao Zhang } 2986ae37ee31SJunchao Zhang #else 2987b4285af6SJunchao Zhang size_t bufSize2; 2988fcdce8c4SStefano Zampini /* ask bufferSize bytes for external memory */ 29899371c9d4SSatish Balay stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL); 29909371c9d4SSatish Balay PetscCallCUSPARSE(stat); 29919566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 2992fcdce8c4SStefano Zampini /* inspect the matrices A and B to understand the memory requirement for the next step */ 29939371c9d4SSatish Balay stat = 
cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 29949371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2995fcdce8c4SStefano Zampini /* ask bufferSize again bytes for external memory */ 29969371c9d4SSatish Balay stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 29979371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2998fcdce8c4SStefano Zampini /* The CUSPARSE documentation is not clear, nor the API 2999fcdce8c4SStefano Zampini We need both buffers to perform the operations properly! 3000fcdce8c4SStefano Zampini mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 3001fcdce8c4SStefano Zampini it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 3002fcdce8c4SStefano Zampini is stored in the descriptor! What a messy API... 
*/ 30039566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 3004fcdce8c4SStefano Zampini /* compute the intermediate product of A * B */ 30059371c9d4SSatish Balay stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 30069371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3007fcdce8c4SStefano Zampini /* get matrix C non-zero entries C_nnz1 */ 30089566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3009fcdce8c4SStefano Zampini c->nz = (PetscInt)C_nnz1; 30109371c9d4SSatish Balay PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 30119371c9d4SSatish Balay mmdata->mmBufferSize / 1024)); 3012fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 30139566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3014fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 30159566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 30169371c9d4SSatish Balay stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 30179371c9d4SSatish Balay PetscCallCUSPARSE(stat); 30189371c9d4SSatish Balay stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 30199371c9d4SSatish Balay PetscCallCUSPARSE(stat); 
3020ae37ee31SJunchao Zhang #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3021fcdce8c4SStefano Zampini #else 30229566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 30239371c9d4SSatish Balay stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 30249371c9d4SSatish Balay Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 30259371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3026fcdce8c4SStefano Zampini c->nz = cnz; 3027fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 30289566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3029fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 30309566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3031fcdce8c4SStefano Zampini 30329566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3033fcdce8c4SStefano Zampini /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3034fcdce8c4SStefano Zampini I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 3035fcdce8c4SStefano Zampini D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
*/ 30369371c9d4SSatish Balay stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 30379371c9d4SSatish Balay Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 30389371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3039fcdce8c4SStefano Zampini #endif 30409566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(mmdata->flops)); 30419566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3042fcdce8c4SStefano Zampini finalizesym: 3043fcdce8c4SStefano Zampini c->singlemalloc = PETSC_FALSE; 3044fcdce8c4SStefano Zampini c->free_a = PETSC_TRUE; 3045fcdce8c4SStefano Zampini c->free_ij = PETSC_TRUE; 30469566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m + 1, &c->i)); 30479566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz, &c->j)); 3048fcdce8c4SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 3049fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 3050fcdce8c4SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3051fcdce8c4SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3052fcdce8c4SStefano Zampini ii = *Ccsr->row_offsets; 3053fcdce8c4SStefano Zampini jj = *Ccsr->column_indices; 3054fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 30559566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 30569566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3057fcdce8c4SStefano Zampini } else { 
3058fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 3059fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 30609566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 30619566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3062fcdce8c4SStefano Zampini } 3063fcdce8c4SStefano Zampini if (ciscompressed) { /* need to expand host row offsets */ 3064fcdce8c4SStefano Zampini PetscInt r = 0; 3065fcdce8c4SStefano Zampini c->i[0] = 0; 3066fcdce8c4SStefano Zampini for (k = 0; k < c->compressedrow.nrows; k++) { 3067fcdce8c4SStefano Zampini const PetscInt next = c->compressedrow.rindex[k]; 3068fcdce8c4SStefano Zampini const PetscInt old = c->compressedrow.i[k]; 3069fcdce8c4SStefano Zampini for (; r < next; r++) c->i[r + 1] = old; 3070fcdce8c4SStefano Zampini } 3071fcdce8c4SStefano Zampini for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3072fcdce8c4SStefano Zampini } 30739566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 30749566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->ilen)); 30759566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->imax)); 3076fcdce8c4SStefano Zampini c->maxnz = c->nz; 3077fcdce8c4SStefano Zampini c->nonzerorowcnt = 0; 3078fcdce8c4SStefano Zampini c->rmax = 0; 3079fcdce8c4SStefano Zampini for (k = 0; k < m; k++) { 3080fcdce8c4SStefano Zampini const PetscInt nn = c->i[k + 1] - c->i[k]; 3081fcdce8c4SStefano Zampini c->ilen[k] = c->imax[k] = nn; 3082fcdce8c4SStefano Zampini c->nonzerorowcnt += (PetscInt) !!nn; 3083fcdce8c4SStefano Zampini c->rmax = PetscMax(c->rmax, nn); 3084fcdce8c4SStefano Zampini } 30859566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(C)); 
30869566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->a));
3087fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
3088fcdce8c4SStefano Zampini
3089fcdce8c4SStefano Zampini   C->nonzerostate++;
30909566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
30919566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
3092fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
3093fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3094fcdce8c4SStefano Zampini   C->preallocated = PETSC_TRUE;
3095fcdce8c4SStefano Zampini   C->assembled = PETSC_FALSE;
3096fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
3097abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3098fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
3099fcdce8c4SStefano Zampini     C->offloadmask = PETSC_OFFLOAD_GPU;
3100fcdce8c4SStefano Zampini   }
3101fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3102fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
3103fcdce8c4SStefano Zampini }
3104fcdce8c4SStefano Zampini
3105fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3106fcdce8c4SStefano Zampini
3107fcdce8c4SStefano Zampini /* handles sparse or dense B */
/* Selects the productsymbolic implementation for a product whose A is MATSEQAIJCUSPARSE.
   - When both B and C are MATSEQAIJCUSPARSE, per-product-type options (e.g. -matmatmult_backend_cpu,
     -mat_product_algorithm_backend_cpu) allow the user to force the CPU (SeqAIJ) backend;
     product->api_user chooses between the MatXxxMult()-style and MatProductCreate()-style option names.
   - Matrices bound to the CPU (boundtocpu) are never dispatched to the GPU backend. */
31089371c9d4SSatish Balay static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) {
3109fcdce8c4SStefano Zampini   Mat_Product *product = mat->product;
3110fcdce8c4SStefano Zampini   PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3111fcdce8c4SStefano Zampini
3112fcdce8c4SStefano Zampini   PetscFunctionBegin;
3113fcdce8c4SStefano Zampini   MatCheckProduct(mat, 1);
31149566063dSJacob Faibussowitsch   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
311548a46eb9SPierre Jolivet   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3116fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
3117fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
311848a46eb9SPierre Jolivet     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3119fcdce8c4SStefano Zampini   }
312065e4b4d4SStefano Zampini   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
312165e4b4d4SStefano Zampini     PetscBool usecpu = PETSC_FALSE;
/* One case per product type; each exposes its own *_backend_cpu option under the matching prefix */
312265e4b4d4SStefano Zampini     switch (product->type) {
312365e4b4d4SStefano Zampini     case MATPRODUCT_AB:
312465e4b4d4SStefano Zampini       if (product->api_user) {
3125d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
31269566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3127d0609cedSBarry Smith         PetscOptionsEnd();
312865e4b4d4SStefano Zampini       } else {
3129d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
31309566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3131d0609cedSBarry Smith         PetscOptionsEnd();
313265e4b4d4SStefano Zampini       }
313365e4b4d4SStefano Zampini       break;
313465e4b4d4SStefano Zampini     case MATPRODUCT_AtB:
313565e4b4d4SStefano Zampini       if (product->api_user) {
3136d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
31379566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3138d0609cedSBarry Smith         PetscOptionsEnd();
313965e4b4d4SStefano Zampini       } else {
3140d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
31419566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3142d0609cedSBarry Smith         PetscOptionsEnd();
314365e4b4d4SStefano Zampini       }
314465e4b4d4SStefano Zampini       break;
314565e4b4d4SStefano Zampini     case MATPRODUCT_PtAP:
314665e4b4d4SStefano Zampini       if (product->api_user) {
3147d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
31489566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3149d0609cedSBarry Smith         PetscOptionsEnd();
315065e4b4d4SStefano Zampini       } else {
3151d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
31529566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3153d0609cedSBarry Smith         PetscOptionsEnd();
315465e4b4d4SStefano Zampini       }
315565e4b4d4SStefano Zampini       break;
315665e4b4d4SStefano Zampini     case MATPRODUCT_RARt:
315765e4b4d4SStefano Zampini       if (product->api_user) {
3158d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
31599566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3160d0609cedSBarry Smith         PetscOptionsEnd();
316165e4b4d4SStefano Zampini       } else {
3162d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
31639566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt",
usecpu, &usecpu, NULL));
3164d0609cedSBarry Smith         PetscOptionsEnd();
316565e4b4d4SStefano Zampini       }
316665e4b4d4SStefano Zampini       break;
316765e4b4d4SStefano Zampini     case MATPRODUCT_ABC:
316865e4b4d4SStefano Zampini       if (product->api_user) {
3169d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
31709566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3171d0609cedSBarry Smith         PetscOptionsEnd();
317265e4b4d4SStefano Zampini       } else {
3173d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
31749566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3175d0609cedSBarry Smith         PetscOptionsEnd();
317665e4b4d4SStefano Zampini       }
317765e4b4d4SStefano Zampini       break;
31789371c9d4SSatish Balay     default: break;
317965e4b4d4SStefano Zampini     }
318065e4b4d4SStefano Zampini     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
318165e4b4d4SStefano Zampini   }
318265e4b4d4SStefano Zampini   /* dispatch */
3183fcdce8c4SStefano Zampini   if (isdense) {
3184ccdfe979SStefano Zampini     switch (product->type) {
3185ccdfe979SStefano Zampini     case MATPRODUCT_AB:
3186ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
3187ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
3188ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
3189ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
3190fcdce8c4SStefano Zampini       if (product->A->boundtocpu) {
31919566063dSJacob Faibussowitsch         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3192fcdce8c4SStefano Zampini       } else {
3193fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3194fcdce8c4SStefano Zampini       }
3195fcdce8c4SStefano Zampini       break;
31969371c9d4SSatish Balay     case MATPRODUCT_ABC:
mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
31979371c9d4SSatish Balay     default: break;
3198ccdfe979SStefano Zampini     }
3199fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
3200fcdce8c4SStefano Zampini     switch (product->type) {
3201fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
3202fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
32039371c9d4SSatish Balay     case MATPRODUCT_ABt: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; break;
3204fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
3205fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
32069371c9d4SSatish Balay     case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
32079371c9d4SSatish Balay     default: break;
3208fcdce8c4SStefano Zampini     }
3209fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
32109566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3211fcdce8c4SStefano Zampini   }
3212ccdfe979SStefano Zampini   PetscFunctionReturn(0);
3213ccdfe979SStefano Zampini }
3214ccdfe979SStefano Zampini
/* MatMult: yy = A*xx. Thin wrapper over the shared kernel (trans = herm = PETSC_FALSE, no additive vector). */
32159371c9d4SSatish Balay static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
32169ae82921SPaul Mullowney   PetscFunctionBegin;
32179566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
3218e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
3219e6e9a74fSStefano Zampini }
3220e6e9a74fSStefano Zampini
/* MatMultAdd: zz = A*xx + yy */
32219371c9d4SSatish Balay static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
3222e6e9a74fSStefano Zampini   PetscFunctionBegin;
32239566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3224e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
3225e6e9a74fSStefano Zampini }
3226e6e9a74fSStefano Zampini
/* MatMultHermitianTranspose: yy = A^H * xx (trans = herm = PETSC_TRUE) */
32279371c9d4SSatish Balay static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
3228e6e9a74fSStefano Zampini   PetscFunctionBegin;
32299566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3230e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
3231e6e9a74fSStefano Zampini }
3232e6e9a74fSStefano Zampini
/* MatMultHermitianTransposeAdd: zz = A^H * xx + yy */
32339371c9d4SSatish Balay static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
3234e6e9a74fSStefano Zampini   PetscFunctionBegin;
32359566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
32369ae82921SPaul Mullowney   PetscFunctionReturn(0);
32379ae82921SPaul Mullowney }
32389ae82921SPaul Mullowney
/* MatMultTranspose: yy = A^T * xx (trans = PETSC_TRUE, herm = PETSC_FALSE) */
32399371c9d4SSatish Balay static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
3240ca45077fSPaul Mullowney   PetscFunctionBegin;
32419566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3242ca45077fSPaul Mullowney   PetscFunctionReturn(0);
3243ca45077fSPaul Mullowney }
3244ca45077fSPaul Mullowney
/* Scatter-add: y[idx[i]] += x[i] for i in [0, n). 1D grid, bounds-checked tail.
   NOTE(review): no atomicAdd is used, so idx is assumed duplicate-free — holds for the
   compressed-row index list (cprowIndices) this kernel is launched with; confirm for any new caller. */
32459371c9d4SSatish Balay __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) {
3246a0e72f99SJunchao Zhang   int i = blockIdx.x * blockDim.x + threadIdx.x;
3247a0e72f99SJunchao Zhang   if (i < n) y[idx[i]] += x[i];
3248a0e72f99SJunchao Zhang }
3249a0e72f99SJunchao Zhang
3250afb2bd1cSJunchao Zhang /* z = op(A) x + y.
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/* Shared SpMV engine behind all MatMult*_SeqAIJCUSPARSE wrappers.
   xx: input vector; yy: optional additive vector (NULL for plain mult; may alias zz); zz: result.
   trans selects op(A) = A^T, herm additionally selects op(A) = A^H (herm without trans is rejected below).
   Compressed-row matrices (zero rows dropped) go through cusparsestruct->workVector plus a
   scatter pass, since the cuSPARSE product then has fewer rows/cols than the full vectors. */
32519371c9d4SSatish Balay static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) {
32529ae82921SPaul Mullowney   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3253aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
32549ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3255e6e9a74fSStefano Zampini   PetscScalar *xarray, *zarray, *dptr, *beta, *xptr;
3256e6e9a74fSStefano Zampini   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3257e6e9a74fSStefano Zampini   PetscBool compressed;
3258afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3259afb2bd1cSJunchao Zhang   PetscInt nx, ny;
3260afb2bd1cSJunchao Zhang #endif
32616e111a19SKarl Rupp
32629ae82921SPaul Mullowney   PetscFunctionBegin;
326308401ef6SPierre Jolivet   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
/* Empty matrix: result is just y (or zero) — skip all cuSPARSE work */
3264cbc6b225SStefano Zampini   if (!a->nz) {
32659566063dSJacob Faibussowitsch     if (!yy) PetscCall(VecSet_SeqCUDA(zz, 0));
32669566063dSJacob Faibussowitsch     else PetscCall(VecCopy_SeqCUDA(yy, zz));
3267e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
3268e6e9a74fSStefano Zampini   }
326934d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
32709566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3271e6e9a74fSStefano Zampini   if (!trans) {
32729ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
32735f80ce2aSJacob Faibussowitsch     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3274e6e9a74fSStefano Zampini   } else {
/* Hermitian, or no explicit transpose requested: let cuSPARSE apply the (conjugate) transpose op;
   otherwise use/build the explicitly stored transpose so op can stay NON_TRANSPOSE */
32751a2c6b5cSJunchao Zhang     if (herm || !A->form_explicit_transpose) {
3276e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3277e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3278e6e9a74fSStefano Zampini     } else {
32799566063dSJacob Faibussowitsch       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3280e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3281e6e9a74fSStefano Zampini     }
3282e6e9a74fSStefano Zampini   }
3283e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3284e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3285213423ffSJunchao Zhang
3286e6e9a74fSStefano Zampini   try {
32879566063dSJacob Faibussowitsch     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
32889566063dSJacob Faibussowitsch     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
32899566063dSJacob Faibussowitsch     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */
3290afb2bd1cSJunchao Zhang
32919566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
3292e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3293afb2bd1cSJunchao Zhang       /* z = A x + beta y.
3294afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3295afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3296afb2bd1cSJunchao Zhang       */
3297e6e9a74fSStefano Zampini       xptr = xarray;
3298afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3299213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3300afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3301afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3302afb2bd1cSJunchao Zhang          allocated to accommodate different uses. So we get the length info directly from mat.
3303afb2bd1cSJunchao Zhang       */
3304afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3305afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3306afb2bd1cSJunchao Zhang         nx = mat->num_cols;
3307afb2bd1cSJunchao Zhang         ny = mat->num_rows;
3308afb2bd1cSJunchao Zhang       }
3309afb2bd1cSJunchao Zhang #endif
3310e6e9a74fSStefano Zampini     } else {
3311afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
3312afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3313afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3314afb2bd1cSJunchao Zhang       */
3315afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3316e6e9a74fSStefano Zampini       dptr = zarray;
3317e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3318afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
3319e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3320*d0967f54SJacob Faibussowitsch
3321*d0967f54SJacob Faibussowitsch         thrust::for_each(
3322*d0967f54SJacob Faibussowitsch #if PetscDefined(HAVE_THRUST_ASYNC)
3323*d0967f54SJacob Faibussowitsch           thrust::cuda::par.on(PetscDefaultCudaStream),
3324*d0967f54SJacob Faibussowitsch #endif
3325*d0967f54SJacob Faibussowitsch           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
33269371c9d4SSatish Balay           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3327e6e9a74fSStefano Zampini       }
3328afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3329afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3330afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3331afb2bd1cSJunchao Zhang         nx = mat->num_rows;
3332afb2bd1cSJunchao Zhang         ny = mat->num_cols;
3333afb2bd1cSJunchao Zhang       }
3334afb2bd1cSJunchao Zhang #endif
3335e6e9a74fSStefano Zampini     }
33369ae82921SPaul Mullowney
3337afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
3338aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3339afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
33405f80ce2aSJacob Faibussowitsch       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3341afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
33429566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
33439566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
33449371c9d4SSatish Balay         PetscCallCUSPARSE(
33459371c9d4SSatish Balay           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
33469566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3347afb2bd1cSJunchao Zhang
3348afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3349afb2bd1cSJunchao Zhang       } else {
3350afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
33519566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
33529566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3353afb2bd1cSJunchao Zhang       }
3354afb2bd1cSJunchao Zhang
33559371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
33569371c9d4SSatish Balay                                      matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3357afb2bd1cSJunchao Zhang #else
33587656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
33599371c9d4SSatish Balay       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(),
mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3360afb2bd1cSJunchao Zhang #endif
3361aa372e3fSPaul Mullowney     } else {
3362213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
3363afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3364afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3365afb2bd1cSJunchao Zhang #else
3366301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
33679371c9d4SSatish Balay         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3368afb2bd1cSJunchao Zhang #endif
3369a65300a6SPaul Mullowney       }
3370aa372e3fSPaul Mullowney     }
33719566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3372aa372e3fSPaul Mullowney
3373e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3374213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
3375213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
33769566063dSJacob Faibussowitsch           PetscCall(VecCopy_SeqCUDA(yy, zz)); /* zz = yy */
3377e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
33789566063dSJacob Faibussowitsch           PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */
33797656d835SStefano Zampini         }
3380213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx.
A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
33819566063dSJacob Faibussowitsch         PetscCall(VecSet_SeqCUDA(zz, 0));
33827656d835SStefano Zampini       }
33837656d835SStefano Zampini
3384213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3385213423ffSJunchao Zhang       if (compressed) {
33869566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
3387a0e72f99SJunchao Zhang         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
3388a0e72f99SJunchao Zhang            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
3389a0e72f99SJunchao Zhang            prevent that. So I just add a ScatterAdd kernel.
3390a0e72f99SJunchao Zhang         */
3391a0e72f99SJunchao Zhang #if 0
3392a0e72f99SJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3393a0e72f99SJunchao Zhang         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3394a0e72f99SJunchao Zhang                                 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3395e6e9a74fSStefano Zampini                                 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3396c41cb2e2SAlejandro Lamas Daviña                                 VecCUDAPlusEquals());
3397a0e72f99SJunchao Zhang #else
3398a0e72f99SJunchao Zhang         PetscInt n = matstruct->cprowIndices->size();
3399a0e72f99SJunchao Zhang         ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3400a0e72f99SJunchao Zhang #endif
34019566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
3402e6e9a74fSStefano Zampini       }
3403e6e9a74fSStefano Zampini     } else
{
34049371c9d4SSatish Balay       if (yy && yy != zz) { PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */ }
3405e6e9a74fSStefano Zampini     }
34069566063dSJacob Faibussowitsch     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
34079566063dSJacob Faibussowitsch     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
34089566063dSJacob Faibussowitsch     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
34099371c9d4SSatish Balay   } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
3410e6e9a74fSStefano Zampini   if (yy) {
34119566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3412e6e9a74fSStefano Zampini   } else {
34139566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3414e6e9a74fSStefano Zampini   }
34159ae82921SPaul Mullowney   PetscFunctionReturn(0);
34169ae82921SPaul Mullowney }
34179ae82921SPaul Mullowney
/* MatMultTransposeAdd: zz = A^T * xx + yy */
34189371c9d4SSatish Balay static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
3419ca45077fSPaul Mullowney   PetscFunctionBegin;
34209566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3421ca45077fSPaul Mullowney   PetscFunctionReturn(0);
3422ca45077fSPaul Mullowney }
3423ca45077fSPaul Mullowney
/* MatAssemblyEnd: finish host-side assembly, then invalidate the cached device matrix
   if the nonzero pattern changed since it was built */
34249371c9d4SSatish Balay static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode) {
3425042217e8SBarry Smith   PetscObjectState onnz = A->nonzerostate;
3426042217e8SBarry Smith   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
34273fa6b06aSMark Adams
3428042217e8SBarry Smith   PetscFunctionBegin;
34299566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3430042217e8SBarry Smith   if (onnz != A->nonzerostate && cusp->deviceMat) {
34319566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
34329566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusp->deviceMat));
3433042217e8SBarry Smith     cusp->deviceMat = NULL;
3434042217e8SBarry Smith   }
34359ae82921SPaul Mullowney   PetscFunctionReturn(0);
34369ae82921SPaul Mullowney }
34379ae82921SPaul Mullowney
34389ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
3439e057df02SPaul Mullowney /*@
344011a5261eSBarry Smith   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3441e057df02SPaul Mullowney   (the default parallel PETSc format). This matrix will ultimately be pushed down
344211a5261eSBarry Smith   to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
3443e057df02SPaul Mullowney   assembly performance the user should preallocate the matrix storage by setting
3444e057df02SPaul Mullowney   the parameter nz (or the array nnz). By setting these parameters accurately,
3445e057df02SPaul Mullowney   performance during matrix assembly can be increased by more than a factor of 50.
34469ae82921SPaul Mullowney
3447d083f849SBarry Smith   Collective
34489ae82921SPaul Mullowney
34499ae82921SPaul Mullowney   Input Parameters:
345011a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF`
34519ae82921SPaul Mullowney . m - number of rows
34529ae82921SPaul Mullowney . n - number of columns
34539ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows)
34549ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows
34550298fd71SBarry Smith   (possibly different for each row) or NULL
34569ae82921SPaul Mullowney
34579ae82921SPaul Mullowney   Output Parameter:
34589ae82921SPaul Mullowney . A - the matrix
34599ae82921SPaul Mullowney
346011a5261eSBarry Smith   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
34619ae82921SPaul Mullowney   MatXXXXSetPreallocation() paradigm instead of this routine directly.
346211a5261eSBarry Smith   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
34639ae82921SPaul Mullowney
34649ae82921SPaul Mullowney   Notes:
34659ae82921SPaul Mullowney   If nnz is given then nz is ignored
34669ae82921SPaul Mullowney
346711a5261eSBarry Smith   The AIJ format, also called
346811a5261eSBarry Smith   compressed row storage, is fully compatible with standard Fortran 77
34699ae82921SPaul Mullowney   storage. That is, the stored row and column indices can begin at
34709ae82921SPaul Mullowney   either one (as in Fortran) or zero. See the users' manual for details.
34719ae82921SPaul Mullowney
34729ae82921SPaul Mullowney   Specify the preallocated storage with either nz or nnz (not both).
347311a5261eSBarry Smith   Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
34749ae82921SPaul Mullowney   allocation. For large problems you MUST preallocate memory or you
34759ae82921SPaul Mullowney   will get TERRIBLE performance, see the users' manual chapter on matrices.
34769ae82921SPaul Mullowney
34779ae82921SPaul Mullowney   By default, this format uses inodes (identical nodes) when possible, to
34789ae82921SPaul Mullowney   improve numerical efficiency of matrix-vector products and solves. We
34799ae82921SPaul Mullowney   search for consecutive rows with the same nonzero structure, thereby
34809ae82921SPaul Mullowney   reusing matrix information to achieve increased efficiency.
34819ae82921SPaul Mullowney
34829ae82921SPaul Mullowney   Level: intermediate
34839ae82921SPaul Mullowney
348411a5261eSBarry Smith .seealso: `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
34859ae82921SPaul Mullowney @*/
34869371c9d4SSatish Balay PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) {
34879ae82921SPaul Mullowney   PetscFunctionBegin;
34889566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
34899566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
34909566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
34919566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
34929ae82921SPaul Mullowney   PetscFunctionReturn(0);
34939ae82921SPaul Mullowney }
34949ae82921SPaul Mullowney
/* MatDestroy: free the cuSPARSE-side storage (factored or plain) and detach composed methods */
34959371c9d4SSatish Balay static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) {
34969ae82921SPaul Mullowney   PetscFunctionBegin;
34979ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
34989566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
34999ae82921SPaul Mullowney   } else {
35009566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3501aa372e3fSPaul Mullowney   }
35029566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
35039566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
35049566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
35059566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,
"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 35069566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 35079566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 35089566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 35099566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 35109566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 35119566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL)); 35129566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 35139ae82921SPaul Mullowney PetscFunctionReturn(0); 35149ae82921SPaul Mullowney } 35159ae82921SPaul Mullowney 3516ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *); 351795639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool); 35189371c9d4SSatish Balay static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) { 35199ff858a8SKarl Rupp PetscFunctionBegin; 35209566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B)); 35219566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B)); 35229ff858a8SKarl Rupp PetscFunctionReturn(0); 35239ff858a8SKarl Rupp } 35249ff858a8SKarl Rupp 35259371c9d4SSatish Balay static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) { 3526a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 3527039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cy; 
3528039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cx; 3529039c6fbaSStefano Zampini PetscScalar *ay; 3530039c6fbaSStefano Zampini const PetscScalar *ax; 3531039c6fbaSStefano Zampini CsrMatrix *csry, *csrx; 3532e6e9a74fSStefano Zampini 353395639643SRichard Tran Mills PetscFunctionBegin; 3534a49f1ed0SStefano Zampini cy = (Mat_SeqAIJCUSPARSE *)Y->spptr; 3535a49f1ed0SStefano Zampini cx = (Mat_SeqAIJCUSPARSE *)X->spptr; 3536039c6fbaSStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 35379566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 35389566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3539a587d139SMark PetscFunctionReturn(0); 354095639643SRichard Tran Mills } 3541039c6fbaSStefano Zampini /* if we are here, it means both matrices are bound to GPU */ 35429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 35439566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 35445f80ce2aSJacob Faibussowitsch PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 35455f80ce2aSJacob Faibussowitsch PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3546039c6fbaSStefano Zampini csry = (CsrMatrix *)cy->mat->mat; 3547039c6fbaSStefano Zampini csrx = (CsrMatrix *)cx->mat->mat; 3548039c6fbaSStefano Zampini /* see if we can turn this into a cublas axpy */ 3549039c6fbaSStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3550039c6fbaSStefano Zampini bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin()); 3551ad540459SPierre Jolivet if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin()); 3552039c6fbaSStefano Zampini if (eq) str = 
SAME_NONZERO_PATTERN; 3553039c6fbaSStefano Zampini } 3554d2be01edSStefano Zampini /* spgeam is buggy with one column */ 3555d2be01edSStefano Zampini if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3556039c6fbaSStefano Zampini 3557039c6fbaSStefano Zampini if (str == SUBSET_NONZERO_PATTERN) { 3558039c6fbaSStefano Zampini PetscScalar b = 1.0; 3559039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3560039c6fbaSStefano Zampini size_t bufferSize; 3561039c6fbaSStefano Zampini void *buffer; 3562039c6fbaSStefano Zampini #endif 3563039c6fbaSStefano Zampini 35649566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 35659566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 35669566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 3567039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 35689371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 35699371c9d4SSatish Balay csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize)); 35709566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&buffer, bufferSize)); 35719566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 35729371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 35739371c9d4SSatish Balay csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer)); 35749566063dSJacob 
Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 35759566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 35769566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(buffer)); 3577039c6fbaSStefano Zampini #else 35789566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 35799371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 35809371c9d4SSatish Balay csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get())); 35819566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 35829566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3583039c6fbaSStefano Zampini #endif 35849566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 35859566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 35869566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 35879566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3588039c6fbaSStefano Zampini } else if (str == SAME_NONZERO_PATTERN) { 3589a587d139SMark cublasHandle_t cublasv2handle; 3590a587d139SMark PetscBLASInt one = 1, bnz = 1; 3591039c6fbaSStefano Zampini 35929566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 35939566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 35949566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 35959566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(x->nz, &bnz)); 35969566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 35979566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one)); 35989566063dSJacob Faibussowitsch 
PetscCall(PetscLogGpuFlops(2.0 * bnz)); 35999566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 36009566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 36019566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 36029566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3603039c6fbaSStefano Zampini } else { 36049566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 36059566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3606a587d139SMark } 360795639643SRichard Tran Mills PetscFunctionReturn(0); 360895639643SRichard Tran Mills } 360995639643SRichard Tran Mills 36109371c9d4SSatish Balay static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) { 361133c9ba73SStefano Zampini Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data; 361233c9ba73SStefano Zampini PetscScalar *ay; 361333c9ba73SStefano Zampini cublasHandle_t cublasv2handle; 361433c9ba73SStefano Zampini PetscBLASInt one = 1, bnz = 1; 361533c9ba73SStefano Zampini 361633c9ba73SStefano Zampini PetscFunctionBegin; 36179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 36189566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 36199566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(y->nz, &bnz)); 36209566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 36219566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one)); 36229566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(bnz)); 36239566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 36249566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 36259566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 362633c9ba73SStefano Zampini PetscFunctionReturn(0); 362733c9ba73SStefano Zampini } 362833c9ba73SStefano Zampini 36299371c9d4SSatish Balay static PetscErrorCode 
MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool   both = PETSC_FALSE;
  Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  /* zero the device copies (mat and cached transpose) when they exist, then the host values */
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE; /* device values zeroed too, so both copies stay valid */
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = both ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Switch the method tables between the host (SeqAIJ) and device (SeqAIJCUSPARSE) implementations */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  /* factored matrices only record the flag */
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are a host-side optimization; only honor them when bound to the CPU */
  a->inode.use = (flg && a->inode.size) ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format =
MAT_CUSPARSE_CSR; 3740d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3741ba986b86SSatish Balay #if CUSPARSE_VERSION > 11301 3742a435da06SStefano Zampini spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3743a435da06SStefano Zampini #else 3744d8132acaSStefano Zampini spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3745a435da06SStefano Zampini #endif 3746d8132acaSStefano Zampini spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3747d8132acaSStefano Zampini spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3748d8132acaSStefano Zampini #endif 37491a2c6b5cSJunchao Zhang B->spptr = spptr; 37509ae82921SPaul Mullowney } else { 3751e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 3752e6e9a74fSStefano Zampini 37539566063dSJacob Faibussowitsch PetscCall(PetscNew(&spptr)); 37549566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 37559566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 3756e6e9a74fSStefano Zampini B->spptr = spptr; 37579ae82921SPaul Mullowney } 3758e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 375949735bf3SStefano Zampini } 3760693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 37619ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 37621a2c6b5cSJunchao Zhang B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 37639ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 376495639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3765693b0035SStefano Zampini B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 37662205254eSKarl Rupp 37679566063dSJacob Faibussowitsch PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE)); 37689566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE)); 37699566063dSJacob 
Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 3770ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE) 37719566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 3772ae48a8d0SStefano Zampini #endif 37739566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 37749ae82921SPaul Mullowney PetscFunctionReturn(0); 37759ae82921SPaul Mullowney } 37769ae82921SPaul Mullowney 37779371c9d4SSatish Balay PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) { 377802fe1965SBarry Smith PetscFunctionBegin; 37799566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(B)); 37809566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B)); 378102fe1965SBarry Smith PetscFunctionReturn(0); 378202fe1965SBarry Smith } 378302fe1965SBarry Smith 37843ca39a21SBarry Smith /*MC 3785e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3786e057df02SPaul Mullowney 378711a5261eSBarry Smith A matrix type type whose data resides on NVIDIA GPUs. These matrices can be in either 378811a5261eSBarry Smith CSR, ELL, or Hybrid format. 378911a5261eSBarry Smith All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library. 3790e057df02SPaul Mullowney 3791e057df02SPaul Mullowney Options Database Keys: 379211a5261eSBarry Smith + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()` 379311a5261eSBarry Smith . -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid). 
379411a5261eSBarry Smith - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid). 379511a5261eSBarry Smith + -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU 3796e057df02SPaul Mullowney 3797e057df02SPaul Mullowney Level: beginner 3798e057df02SPaul Mullowney 379911a5261eSBarry Smith .seealso: `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 3800e057df02SPaul Mullowney M*/ 38017f756511SDominic Meiser 3802bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *); 38030f39cd5aSBarry Smith 38049371c9d4SSatish Balay PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) { 380542c9c57cSBarry Smith PetscFunctionBegin; 38069566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band)); 38079566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse)); 38089566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse)); 38099566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse)); 38109566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse)); 3811bddcd29dSMark Adams 381242c9c57cSBarry Smith PetscFunctionReturn(0); 381342c9c57cSBarry Smith } 381429b38603SBarry Smith 38159371c9d4SSatish Balay static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) { 
3816cbc6b225SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr; 3817cbc6b225SStefano Zampini 3818cbc6b225SStefano Zampini PetscFunctionBegin; 3819cbc6b225SStefano Zampini if (!cusp) PetscFunctionReturn(0); 3820cbc6b225SStefano Zampini delete cusp->cooPerm; 3821cbc6b225SStefano Zampini delete cusp->cooPerm_a; 3822cbc6b225SStefano Zampini cusp->cooPerm = NULL; 3823cbc6b225SStefano Zampini cusp->cooPerm_a = NULL; 3824cbc6b225SStefano Zampini if (cusp->use_extended_coo) { 38259566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->jmap_d)); 38269566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->perm_d)); 3827cbc6b225SStefano Zampini } 3828cbc6b225SStefano Zampini cusp->use_extended_coo = PETSC_FALSE; 3829cbc6b225SStefano Zampini PetscFunctionReturn(0); 3830cbc6b225SStefano Zampini } 3831cbc6b225SStefano Zampini 38329371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) { 38337f756511SDominic Meiser PetscFunctionBegin; 38347f756511SDominic Meiser if (*cusparsestruct) { 38359566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format)); 38369566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format)); 38377f756511SDominic Meiser delete (*cusparsestruct)->workVector; 383881902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 38397e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm; 38407e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_a; 3841a49f1ed0SStefano Zampini delete (*cusparsestruct)->csr2csc_i; 38429566063dSJacob Faibussowitsch if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle)); 38439566063dSJacob Faibussowitsch if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d)); 38449566063dSJacob Faibussowitsch if ((*cusparsestruct)->perm_d) 
PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d)); 38459566063dSJacob Faibussowitsch PetscCall(PetscFree(*cusparsestruct)); 38467f756511SDominic Meiser } 38477f756511SDominic Meiser PetscFunctionReturn(0); 38487f756511SDominic Meiser } 38497f756511SDominic Meiser 38509371c9d4SSatish Balay static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) { 38517f756511SDominic Meiser PetscFunctionBegin; 38527f756511SDominic Meiser if (*mat) { 38537f756511SDominic Meiser delete (*mat)->values; 38547f756511SDominic Meiser delete (*mat)->column_indices; 38557f756511SDominic Meiser delete (*mat)->row_offsets; 38567f756511SDominic Meiser delete *mat; 38577f756511SDominic Meiser *mat = 0; 38587f756511SDominic Meiser } 38597f756511SDominic Meiser PetscFunctionReturn(0); 38607f756511SDominic Meiser } 38617f756511SDominic Meiser 38629371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) { 38637f756511SDominic Meiser PetscFunctionBegin; 38647f756511SDominic Meiser if (*trifactor) { 38659566063dSJacob Faibussowitsch if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 3866261a78b4SJunchao Zhang if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 38679566063dSJacob Faibussowitsch PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 38689566063dSJacob Faibussowitsch if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 38699566063dSJacob Faibussowitsch if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 3870afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 38719566063dSJacob Faibussowitsch if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 3872afb2bd1cSJunchao Zhang #endif 38739566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactor)); 38747f756511SDominic Meiser } 38757f756511SDominic Meiser PetscFunctionReturn(0); 
}

/* Release a mult structure: the stored CSR/HYB matrix, its descriptors, and the SpMV scratch data */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat)); /* check the error code, consistent with the other CsrMatrix_Destroy() calls */
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
3936da112707SJunchao Zhang fs->init_dev_prop = PETSC_FALSE; 3937da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500 3938da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrRowPtr)); 3939da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrColIdx)); 3940da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrVal)); 3941da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->X)); 3942da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->Y)); 394312ba2bc6SJunchao Zhang // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */ 3944da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_L)); 3945da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_U)); 394612ba2bc6SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt)); 3947da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut)); 3948da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M)); 3949da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L)); 3950da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U)); 3951da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L)); 3952da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt)); 3953da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U)); 3954da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut)); 3955da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X)); 3956da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y)); 3957da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M)); 3958da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M)); 395912ba2bc6SJunchao Zhang 396012ba2bc6SJunchao Zhang fs->createdTransposeSpSVDescr = PETSC_FALSE; 396112ba2bc6SJunchao Zhang fs->updatedTransposeSpSVAnalysis = 
PETSC_FALSE; 3962da112707SJunchao Zhang #endif 3963ccdfe979SStefano Zampini } 3964ccdfe979SStefano Zampini PetscFunctionReturn(0); 3965ccdfe979SStefano Zampini } 3966ccdfe979SStefano Zampini 39679371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) { 3968ccdfe979SStefano Zampini cusparseHandle_t handle; 3969ccdfe979SStefano Zampini 3970ccdfe979SStefano Zampini PetscFunctionBegin; 3971ccdfe979SStefano Zampini if (*trifactors) { 39729566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 397348a46eb9SPierre Jolivet if (handle = (*trifactors)->handle) PetscCallCUSPARSE(cusparseDestroy(handle)); 39749566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactors)); 39757f756511SDominic Meiser } 39767f756511SDominic Meiser PetscFunctionReturn(0); 39777f756511SDominic Meiser } 39787e8381f9SStefano Zampini 39799371c9d4SSatish Balay struct IJCompare { 39809371c9d4SSatish Balay __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) { 39817e8381f9SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 39827e8381f9SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 39837e8381f9SStefano Zampini return false; 39847e8381f9SStefano Zampini } 39857e8381f9SStefano Zampini }; 39867e8381f9SStefano Zampini 39879371c9d4SSatish Balay struct IJEqual { 39889371c9d4SSatish Balay __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) { 39897e8381f9SStefano Zampini if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 39907e8381f9SStefano Zampini return true; 39917e8381f9SStefano Zampini } 39927e8381f9SStefano Zampini }; 39937e8381f9SStefano Zampini 39949371c9d4SSatish Balay struct IJDiff { 39959371c9d4SSatish Balay __host__ __device__ inline PetscInt operator()(const PetscInt &t1, 
const PetscInt &t2) { return t1 == t2 ? 0 : 1; } 39967e8381f9SStefano Zampini }; 39977e8381f9SStefano Zampini 39989371c9d4SSatish Balay struct IJSum { 39999371c9d4SSatish Balay __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; } 40007e8381f9SStefano Zampini }; 40017e8381f9SStefano Zampini 40027e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h> 4003219fbbafSJunchao Zhang /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */ 40049371c9d4SSatish Balay PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) { 40057e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4006fcdce8c4SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4007bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_v = NULL; 400808391a17SStefano Zampini thrust::device_ptr<const PetscScalar> d_v; 40097e8381f9SStefano Zampini CsrMatrix *matrix; 40107e8381f9SStefano Zampini PetscInt n; 40117e8381f9SStefano Zampini 40127e8381f9SStefano Zampini PetscFunctionBegin; 401328b400f6SJacob Faibussowitsch PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct"); 401428b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix"); 40157e8381f9SStefano Zampini if (!cusp->cooPerm) { 40169566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 40179566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 40187e8381f9SStefano Zampini PetscFunctionReturn(0); 40197e8381f9SStefano Zampini } 40207e8381f9SStefano Zampini matrix = (CsrMatrix *)cusp->mat->mat; 402128b400f6SJacob Faibussowitsch PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4022e61fc153SStefano Zampini if (!v) { 4023e61fc153SStefano Zampini if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 
4024e61fc153SStefano Zampini goto finalize; 40257e8381f9SStefano Zampini } 4026e61fc153SStefano Zampini n = cusp->cooPerm->size(); 402708391a17SStefano Zampini if (isCudaMem(v)) { 402808391a17SStefano Zampini d_v = thrust::device_pointer_cast(v); 402908391a17SStefano Zampini } else { 4030e61fc153SStefano Zampini cooPerm_v = new THRUSTARRAY(n); 4031e61fc153SStefano Zampini cooPerm_v->assign(v, v + n); 403208391a17SStefano Zampini d_v = cooPerm_v->data(); 40339566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar))); 403408391a17SStefano Zampini } 40359566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4036e61fc153SStefano Zampini if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 4037ddea5d60SJunchao Zhang if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 4038bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 403908391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()); 4040ddea5d60SJunchao Zhang /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 4041ddea5d60SJunchao Zhang cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 4042ddea5d60SJunchao Zhang cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 
4043ddea5d60SJunchao Zhang */ 4044e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>()); 4045e61fc153SStefano Zampini thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>()); 4046e61fc153SStefano Zampini delete cooPerm_w; 40477e8381f9SStefano Zampini } else { 4048ddea5d60SJunchao Zhang /* all nonzeros in d_v[] are unique entries */ 40499371c9d4SSatish Balay auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin())); 40509371c9d4SSatish Balay auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end())); 4051ddea5d60SJunchao Zhang thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 40527e8381f9SStefano Zampini } 40537e8381f9SStefano Zampini } else { 4054e61fc153SStefano Zampini if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 405508391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()); 4056e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>()); 40577e8381f9SStefano Zampini } else { 40589371c9d4SSatish Balay auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin())); 40599371c9d4SSatish Balay auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end())); 40607e8381f9SStefano Zampini thrust::for_each(zibit, zieit, 
VecCUDAEquals()); 40617e8381f9SStefano Zampini } 40627e8381f9SStefano Zampini } 40639566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4064e61fc153SStefano Zampini finalize: 4065e61fc153SStefano Zampini delete cooPerm_v; 40667e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 40679566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4068fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 40699566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz)); 40709566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n")); 40719566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax)); 4072fcdce8c4SStefano Zampini a->reallocs = 0; 4073fcdce8c4SStefano Zampini A->info.mallocs += 0; 4074fcdce8c4SStefano Zampini A->info.nz_unneeded = 0; 4075fcdce8c4SStefano Zampini A->assembled = A->was_assembled = PETSC_TRUE; 4076fcdce8c4SStefano Zampini A->num_ass++; 40777e8381f9SStefano Zampini PetscFunctionReturn(0); 40787e8381f9SStefano Zampini } 40797e8381f9SStefano Zampini 40809371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) { 4081a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4082a49f1ed0SStefano Zampini 4083a49f1ed0SStefano Zampini PetscFunctionBegin; 4084a49f1ed0SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4085a49f1ed0SStefano Zampini if (!cusp) PetscFunctionReturn(0); 4086a49f1ed0SStefano Zampini if (destroy) { 40879566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 4088a49f1ed0SStefano Zampini delete cusp->csr2csc_i; 4089a49f1ed0SStefano Zampini cusp->csr2csc_i = NULL; 4090a49f1ed0SStefano Zampini } 40911a2c6b5cSJunchao 
Zhang A->transupdated = PETSC_FALSE; 4092a49f1ed0SStefano Zampini PetscFunctionReturn(0); 4093a49f1ed0SStefano Zampini } 4094a49f1ed0SStefano Zampini 40957e8381f9SStefano Zampini #include <thrust/binary_search.h> 4096219fbbafSJunchao Zhang /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */ 40979371c9d4SSatish Balay PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) { 40987e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 40997e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 41007e8381f9SStefano Zampini PetscInt cooPerm_n, nzr = 0; 41017e8381f9SStefano Zampini 41027e8381f9SStefano Zampini PetscFunctionBegin; 41039566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(A->rmap)); 41049566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(A->cmap)); 41057e8381f9SStefano Zampini cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 41067e8381f9SStefano Zampini if (n != cooPerm_n) { 41077e8381f9SStefano Zampini delete cusp->cooPerm; 41087e8381f9SStefano Zampini delete cusp->cooPerm_a; 41097e8381f9SStefano Zampini cusp->cooPerm = NULL; 41107e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 41117e8381f9SStefano Zampini } 41127e8381f9SStefano Zampini if (n) { 4113e8729f6fSJunchao Zhang thrust::device_ptr<PetscInt> d_i, d_j; 4114e8729f6fSJunchao Zhang PetscInt *d_raw_i, *d_raw_j; 4115e8729f6fSJunchao Zhang PetscBool free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE; 4116e8729f6fSJunchao Zhang PetscMemType imtype, jmtype; 4117e8729f6fSJunchao Zhang 4118e8729f6fSJunchao Zhang PetscCall(PetscGetMemType(coo_i, &imtype)); 4119e8729f6fSJunchao Zhang if (PetscMemTypeHost(imtype)) { 4120e8729f6fSJunchao Zhang PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n)); 4121e8729f6fSJunchao Zhang PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice)); 4122e8729f6fSJunchao Zhang d_i = 
thrust::device_pointer_cast(d_raw_i); 4123e8729f6fSJunchao Zhang free_raw_i = PETSC_TRUE; 4124e8729f6fSJunchao Zhang PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt))); 4125e8729f6fSJunchao Zhang } else { 4126e8729f6fSJunchao Zhang d_i = thrust::device_pointer_cast(coo_i); 4127e8729f6fSJunchao Zhang } 4128e8729f6fSJunchao Zhang 4129e8729f6fSJunchao Zhang PetscCall(PetscGetMemType(coo_j, &jmtype)); 4130e8729f6fSJunchao Zhang if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]! 4131e8729f6fSJunchao Zhang PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n)); 4132e8729f6fSJunchao Zhang PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice)); 4133e8729f6fSJunchao Zhang d_j = thrust::device_pointer_cast(d_raw_j); 4134e8729f6fSJunchao Zhang free_raw_j = PETSC_TRUE; 4135e8729f6fSJunchao Zhang PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt))); 4136e8729f6fSJunchao Zhang } else { 4137e8729f6fSJunchao Zhang d_j = thrust::device_pointer_cast(coo_j); 4138e8729f6fSJunchao Zhang } 4139e8729f6fSJunchao Zhang 41407e8381f9SStefano Zampini THRUSTINTARRAY ii(A->rmap->n); 41417e8381f9SStefano Zampini 4142ad540459SPierre Jolivet if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n); 4143ad540459SPierre Jolivet if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n); 41447e8381f9SStefano Zampini 4145ddea5d60SJunchao Zhang /* Ex. 
4146ddea5d60SJunchao Zhang n = 6 4147ddea5d60SJunchao Zhang coo_i = [3,3,1,4,1,4] 4148ddea5d60SJunchao Zhang coo_j = [3,2,2,5,2,6] 4149ddea5d60SJunchao Zhang */ 4150e8729f6fSJunchao Zhang auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j)); 4151e8729f6fSJunchao Zhang auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n)); 41527e8381f9SStefano Zampini 41539566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 41547e8381f9SStefano Zampini thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 4155ddea5d60SJunchao Zhang thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 4156e8729f6fSJunchao Zhang (*cusp->cooPerm_a).assign(d_i, d_i + n); /* copy the sorted array */ 4157e8729f6fSJunchao Zhang THRUSTINTARRAY w(d_j, d_j + n); 41587e8381f9SStefano Zampini 4159ddea5d60SJunchao Zhang /* 4160ddea5d60SJunchao Zhang d_i = [1,1,3,3,4,4] 4161ddea5d60SJunchao Zhang d_j = [2,2,2,3,5,6] 4162ddea5d60SJunchao Zhang cooPerm = [2,4,1,0,3,5] 4163ddea5d60SJunchao Zhang */ 4164ddea5d60SJunchao Zhang auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 4165ddea5d60SJunchao Zhang 4166ddea5d60SJunchao Zhang /* 4167ddea5d60SJunchao Zhang d_i = [1,3,3,4,4,x] 4168ddea5d60SJunchao Zhang ^ekey 4169ddea5d60SJunchao Zhang d_j = [2,2,3,5,6,x] 4170ddea5d60SJunchao Zhang ^nekye 4171ddea5d60SJunchao Zhang */ 41727e8381f9SStefano Zampini if (nekey == ekey) { /* all entries are unique */ 41737e8381f9SStefano Zampini delete cusp->cooPerm_a; 41747e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 4175ddea5d60SJunchao Zhang } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 4176ddea5d60SJunchao Zhang /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 4177ddea5d60SJunchao Zhang adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => 
[1,0,1,0,1,0]*/ 4178ddea5d60SJunchao Zhang adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 4179ddea5d60SJunchao Zhang (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 41807e8381f9SStefano Zampini w[0] = 0; 4181ddea5d60SJunchao Zhang thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 4182ddea5d60SJunchao Zhang thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 41837e8381f9SStefano Zampini } 41847e8381f9SStefano Zampini thrust::counting_iterator<PetscInt> search_begin(0); 4185e8729f6fSJunchao Zhang thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 4186ddea5d60SJunchao Zhang search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 4187ddea5d60SJunchao Zhang ii.begin()); /* ii = [0,1,1,3,5,5]. 
A leading 0 will be added later */ 41889566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 41897e8381f9SStefano Zampini 41909566063dSJacob Faibussowitsch PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i)); 41917e8381f9SStefano Zampini a->singlemalloc = PETSC_FALSE; 41927e8381f9SStefano Zampini a->free_a = PETSC_TRUE; 41937e8381f9SStefano Zampini a->free_ij = PETSC_TRUE; 41949566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i)); 4195ddea5d60SJunchao Zhang a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 41969566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 41977e8381f9SStefano Zampini a->nz = a->maxnz = a->i[A->rmap->n]; 4198fcdce8c4SStefano Zampini a->rmax = 0; 41999566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a->nz, &a->a)); 42009566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a->nz, &a->j)); 4201e8729f6fSJunchao Zhang PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 42029566063dSJacob Faibussowitsch if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen)); 42039566063dSJacob Faibussowitsch if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax)); 42047e8381f9SStefano Zampini for (PetscInt i = 0; i < A->rmap->n; i++) { 42057e8381f9SStefano Zampini const PetscInt nnzr = a->i[i + 1] - a->i[i]; 42067e8381f9SStefano Zampini nzr += (PetscInt) !!(nnzr); 42077e8381f9SStefano Zampini a->ilen[i] = a->imax[i] = nnzr; 4208fcdce8c4SStefano Zampini a->rmax = PetscMax(a->rmax, nnzr); 42097e8381f9SStefano Zampini } 4210fcdce8c4SStefano Zampini a->nonzerorowcnt = nzr; 42117e8381f9SStefano Zampini A->preallocated = PETSC_TRUE; 42129566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt))); 42139566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(A)); 4214e8729f6fSJunchao Zhang if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i)); 
4215e8729f6fSJunchao Zhang if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j)); 42167e8381f9SStefano Zampini } else { 42179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL)); 42187e8381f9SStefano Zampini } 42199566063dSJacob Faibussowitsch PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 42207e8381f9SStefano Zampini 42217e8381f9SStefano Zampini /* We want to allocate the CUSPARSE struct for matvec now. 4222e61fc153SStefano Zampini The code is so convoluted now that I prefer to copy zeros */ 42239566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->nz)); 42249566063dSJacob Faibussowitsch PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6)); 42257e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 42269566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 42279566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 42287e8381f9SStefano Zampini PetscFunctionReturn(0); 42297e8381f9SStefano Zampini } 4230ed502f03SStefano Zampini 42319371c9d4SSatish Balay PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) { 4232219fbbafSJunchao Zhang Mat_SeqAIJ *seq; 4233219fbbafSJunchao Zhang Mat_SeqAIJCUSPARSE *dev; 4234cbc6b225SStefano Zampini PetscBool coo_basic = PETSC_TRUE; 4235219fbbafSJunchao Zhang PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 4236219fbbafSJunchao Zhang 4237219fbbafSJunchao Zhang PetscFunctionBegin; 42389566063dSJacob Faibussowitsch PetscCall(MatResetPreallocationCOO_SeqAIJ(mat)); 42399566063dSJacob Faibussowitsch PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat)); 4240219fbbafSJunchao Zhang if (coo_i) { 42419566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(coo_i, &mtype)); 4242219fbbafSJunchao Zhang if (PetscMemTypeHost(mtype)) { 4243219fbbafSJunchao Zhang for (PetscCount k = 0; k < coo_n; k++) { 42449371c9d4SSatish Balay if (coo_i[k] < 0 || 
coo_j[k] < 0) { 42459371c9d4SSatish Balay coo_basic = PETSC_FALSE; 42469371c9d4SSatish Balay break; 42479371c9d4SSatish Balay } 4248219fbbafSJunchao Zhang } 4249219fbbafSJunchao Zhang } 4250219fbbafSJunchao Zhang } 4251219fbbafSJunchao Zhang 4252219fbbafSJunchao Zhang if (coo_basic) { /* i,j are on device or do not contain negative indices */ 42539566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j)); 4254219fbbafSJunchao Zhang } else { 42559566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 4256cbc6b225SStefano Zampini mat->offloadmask = PETSC_OFFLOAD_CPU; 42579566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4258219fbbafSJunchao Zhang seq = static_cast<Mat_SeqAIJ *>(mat->data); 4259219fbbafSJunchao Zhang dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr); 42609566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount))); 42619566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 42629566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount))); 42639566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice)); 4264219fbbafSJunchao Zhang dev->use_extended_coo = PETSC_TRUE; 4265219fbbafSJunchao Zhang } 4266219fbbafSJunchao Zhang PetscFunctionReturn(0); 4267219fbbafSJunchao Zhang } 4268219fbbafSJunchao Zhang 42699371c9d4SSatish Balay __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) { 4270219fbbafSJunchao Zhang PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 4271219fbbafSJunchao Zhang const PetscCount grid_size = gridDim.x * blockDim.x; 4272b6c38306SJunchao Zhang for 
(; i < nnz; i += grid_size) { 4273b6c38306SJunchao Zhang PetscScalar sum = 0.0; 4274b6c38306SJunchao Zhang for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]]; 4275b6c38306SJunchao Zhang a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum; 4276b6c38306SJunchao Zhang } 4277219fbbafSJunchao Zhang } 4278219fbbafSJunchao Zhang 42799371c9d4SSatish Balay PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) { 4280219fbbafSJunchao Zhang Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data; 4281219fbbafSJunchao Zhang Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr; 4282219fbbafSJunchao Zhang PetscCount Annz = seq->nz; 4283219fbbafSJunchao Zhang PetscMemType memtype; 4284219fbbafSJunchao Zhang const PetscScalar *v1 = v; 4285219fbbafSJunchao Zhang PetscScalar *Aa; 4286219fbbafSJunchao Zhang 4287219fbbafSJunchao Zhang PetscFunctionBegin; 4288219fbbafSJunchao Zhang if (dev->use_extended_coo) { 42899566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v, &memtype)); 4290219fbbafSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 42919566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar))); 42929566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4293219fbbafSJunchao Zhang } 4294219fbbafSJunchao Zhang 42959566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); 42969566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); 4297219fbbafSJunchao Zhang 4298cbc6b225SStefano Zampini if (Annz) { 4299b6c38306SJunchao Zhang MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa); 43009566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); 4301cbc6b225SStefano Zampini } 4302219fbbafSJunchao Zhang 43039566063dSJacob 
Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa)); 43049566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa)); 4305219fbbafSJunchao Zhang 43069566063dSJacob Faibussowitsch if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); 4307219fbbafSJunchao Zhang } else { 43089566063dSJacob Faibussowitsch PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode)); 4309219fbbafSJunchao Zhang } 4310219fbbafSJunchao Zhang PetscFunctionReturn(0); 4311219fbbafSJunchao Zhang } 4312219fbbafSJunchao Zhang 43135b7e41feSStefano Zampini /*@C 431411a5261eSBarry Smith MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for `MATSEQAIJCUSPARSE` matrices. 43155b7e41feSStefano Zampini 43165b7e41feSStefano Zampini Not collective 43175b7e41feSStefano Zampini 43185b7e41feSStefano Zampini Input Parameters: 43195b7e41feSStefano Zampini + A - the matrix 432011a5261eSBarry Smith - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form 43215b7e41feSStefano Zampini 43225b7e41feSStefano Zampini Output Parameters: 43235b7e41feSStefano Zampini + ia - the CSR row pointers 43245b7e41feSStefano Zampini - ja - the CSR column indices 43255b7e41feSStefano Zampini 43265b7e41feSStefano Zampini Level: developer 43275b7e41feSStefano Zampini 432811a5261eSBarry Smith Note: 43295b7e41feSStefano Zampini When compressed is true, the CSR structure does not contain empty rows 43305b7e41feSStefano Zampini 4331db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()` 43325b7e41feSStefano Zampini @*/ 43339371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j) { 43345f101d05SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 43355f101d05SStefano Zampini CsrMatrix *csr; 43365f101d05SStefano Zampini Mat_SeqAIJ *a = 
(Mat_SeqAIJ *)A->data; 43375f101d05SStefano Zampini 43385f101d05SStefano Zampini PetscFunctionBegin; 43395f101d05SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 43405f101d05SStefano Zampini if (!i || !j) PetscFunctionReturn(0); 43415f101d05SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4342aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 43439566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 434428b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 43455f101d05SStefano Zampini csr = (CsrMatrix *)cusp->mat->mat; 43465f101d05SStefano Zampini if (i) { 43475f101d05SStefano Zampini if (!compressed && a->compressedrow.use) { /* need full row offset */ 43485f101d05SStefano Zampini if (!cusp->rowoffsets_gpu) { 43495f101d05SStefano Zampini cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 43505f101d05SStefano Zampini cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 43519566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 43525f101d05SStefano Zampini } 43535f101d05SStefano Zampini *i = cusp->rowoffsets_gpu->data().get(); 43545f101d05SStefano Zampini } else *i = csr->row_offsets->data().get(); 43555f101d05SStefano Zampini } 43565f101d05SStefano Zampini if (j) *j = csr->column_indices->data().get(); 43575f101d05SStefano Zampini PetscFunctionReturn(0); 43585f101d05SStefano Zampini } 43595f101d05SStefano Zampini 43605b7e41feSStefano Zampini /*@C 436111a5261eSBarry Smith MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with `MatSeqAIJCUSPARSEGetIJ()` 43625b7e41feSStefano Zampini 43635b7e41feSStefano Zampini Not collective 43645b7e41feSStefano Zampini 43655b7e41feSStefano Zampini Input Parameters: 43665b7e41feSStefano Zampini + A - the matrix 436711a5261eSBarry 
Smith - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form 43685b7e41feSStefano Zampini 43695b7e41feSStefano Zampini Output Parameters: 43705b7e41feSStefano Zampini + ia - the CSR row pointers 43715b7e41feSStefano Zampini - ja - the CSR column indices 43725b7e41feSStefano Zampini 43735b7e41feSStefano Zampini Level: developer 43745b7e41feSStefano Zampini 4375db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetIJ()` 43765b7e41feSStefano Zampini @*/ 43779371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j) { 43785f101d05SStefano Zampini PetscFunctionBegin; 43795f101d05SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 43805f101d05SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 43815f101d05SStefano Zampini if (i) *i = NULL; 43825f101d05SStefano Zampini if (j) *j = NULL; 43835f101d05SStefano Zampini PetscFunctionReturn(0); 43845f101d05SStefano Zampini } 43855f101d05SStefano Zampini 43865b7e41feSStefano Zampini /*@C 438711a5261eSBarry Smith MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored 43885b7e41feSStefano Zampini 43895b7e41feSStefano Zampini Not Collective 43905b7e41feSStefano Zampini 43915b7e41feSStefano Zampini Input Parameter: 439211a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix 43935b7e41feSStefano Zampini 43945b7e41feSStefano Zampini Output Parameter: 43955b7e41feSStefano Zampini . 
a - pointer to the device data 43965b7e41feSStefano Zampini 43975b7e41feSStefano Zampini Level: developer 43985b7e41feSStefano Zampini 439911a5261eSBarry Smith Note: 440011a5261eSBarry Smith May trigger host-device copies if up-to-date matrix data is on host 44015b7e41feSStefano Zampini 4402db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()` 44035b7e41feSStefano Zampini @*/ 44049371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a) { 4405ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4406ed502f03SStefano Zampini CsrMatrix *csr; 4407ed502f03SStefano Zampini 4408ed502f03SStefano Zampini PetscFunctionBegin; 4409ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4410ed502f03SStefano Zampini PetscValidPointer(a, 2); 4411ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4412aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 44139566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 441428b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4415ed502f03SStefano Zampini csr = (CsrMatrix *)cusp->mat->mat; 441628b400f6SJacob Faibussowitsch PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4417ed502f03SStefano Zampini *a = csr->values->data().get(); 4418ed502f03SStefano Zampini PetscFunctionReturn(0); 4419ed502f03SStefano Zampini } 4420ed502f03SStefano Zampini 44215b7e41feSStefano Zampini /*@C 442211a5261eSBarry Smith MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()` 44235b7e41feSStefano Zampini 44245b7e41feSStefano Zampini Not Collective 44255b7e41feSStefano Zampini 44265b7e41feSStefano Zampini 
   Input Parameter:
.  A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: no diagonal invalidation or object-state bump is needed */
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

   Not Collective

   Input Parameter:
.  A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.
a - pointer to the device data 44555b7e41feSStefano Zampini 44565b7e41feSStefano Zampini Level: developer 44575b7e41feSStefano Zampini 445811a5261eSBarry Smith Note: 445911a5261eSBarry Smith May trigger host-device copies if up-to-date matrix data is on host 44605b7e41feSStefano Zampini 4461db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()` 44625b7e41feSStefano Zampini @*/ 44639371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a) { 4464039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4465039c6fbaSStefano Zampini CsrMatrix *csr; 4466039c6fbaSStefano Zampini 4467039c6fbaSStefano Zampini PetscFunctionBegin; 4468039c6fbaSStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4469039c6fbaSStefano Zampini PetscValidPointer(a, 2); 4470039c6fbaSStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4471aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 44729566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 447328b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4474039c6fbaSStefano Zampini csr = (CsrMatrix *)cusp->mat->mat; 447528b400f6SJacob Faibussowitsch PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4476039c6fbaSStefano Zampini *a = csr->values->data().get(); 4477039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 44789566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 4479039c6fbaSStefano Zampini PetscFunctionReturn(0); 4480039c6fbaSStefano Zampini } 44815b7e41feSStefano Zampini /*@C 448211a5261eSBarry Smith MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()` 

   Not Collective

   Input Parameter:
.  A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.  a - pointer to the device data

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values may have been modified: drop the cached diagonal and advance the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

   Not Collective

   Input Parameter:
.  A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.
a - pointer to the device data 45175b7e41feSStefano Zampini 45185b7e41feSStefano Zampini Level: developer 45195b7e41feSStefano Zampini 452011a5261eSBarry Smith Note: 452111a5261eSBarry Smith Does not trigger host-device copies and flags data validity on the GPU 45225b7e41feSStefano Zampini 4523db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()` 45245b7e41feSStefano Zampini @*/ 45259371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a) { 4526ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4527ed502f03SStefano Zampini CsrMatrix *csr; 4528ed502f03SStefano Zampini 4529ed502f03SStefano Zampini PetscFunctionBegin; 4530ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4531ed502f03SStefano Zampini PetscValidPointer(a, 2); 4532ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4533aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 453428b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4535ed502f03SStefano Zampini csr = (CsrMatrix *)cusp->mat->mat; 453628b400f6SJacob Faibussowitsch PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4537ed502f03SStefano Zampini *a = csr->values->data().get(); 4538039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 45399566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 4540ed502f03SStefano Zampini PetscFunctionReturn(0); 4541ed502f03SStefano Zampini } 4542ed502f03SStefano Zampini 45435b7e41feSStefano Zampini /*@C 454411a5261eSBarry Smith MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()` 45455b7e41feSStefano Zampini 
45465b7e41feSStefano Zampini Not Collective 45475b7e41feSStefano Zampini 45485b7e41feSStefano Zampini Input Parameter: 454911a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix 45505b7e41feSStefano Zampini 45515b7e41feSStefano Zampini Output Parameter: 45525b7e41feSStefano Zampini . a - pointer to the device data 45535b7e41feSStefano Zampini 45545b7e41feSStefano Zampini Level: developer 45555b7e41feSStefano Zampini 4556db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()` 45575b7e41feSStefano Zampini @*/ 45589371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a) { 4559ed502f03SStefano Zampini PetscFunctionBegin; 4560ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4561ed502f03SStefano Zampini PetscValidPointer(a, 2); 4562ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 45639566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 45649566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4565ed502f03SStefano Zampini *a = NULL; 4566ed502f03SStefano Zampini PetscFunctionReturn(0); 4567ed502f03SStefano Zampini } 4568ed502f03SStefano Zampini 45699371c9d4SSatish Balay struct IJCompare4 { 45709371c9d4SSatish Balay __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) { 4571ed502f03SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 4572ed502f03SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4573ed502f03SStefano Zampini return false; 4574ed502f03SStefano Zampini } 4575ed502f03SStefano Zampini }; 4576ed502f03SStefano Zampini 45779371c9d4SSatish Balay struct Shift { 4578ed502f03SStefano Zampini int _shift; 4579ed502f03SStefano Zampini 4580ed502f03SStefano Zampini Shift(int shift) : _shift(shift) { } 45819371c9d4SSatish Balay __host__ __device__ inline int operator()(const int &c) { return c + _shift; 
} 4582ed502f03SStefano Zampini }; 4583ed502f03SStefano Zampini 4584ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */ 45859371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) { 4586ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 4587ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp; 4588ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4589ed502f03SStefano Zampini CsrMatrix *Acsr, *Bcsr, *Ccsr; 4590ed502f03SStefano Zampini PetscInt Annz, Bnnz; 4591ed502f03SStefano Zampini cusparseStatus_t stat; 4592ed502f03SStefano Zampini PetscInt i, m, n, zero = 0; 4593ed502f03SStefano Zampini 4594ed502f03SStefano Zampini PetscFunctionBegin; 4595ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4596ed502f03SStefano Zampini PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 4597ed502f03SStefano Zampini PetscValidPointer(C, 4); 4598ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4599ed502f03SStefano Zampini PetscCheckTypeName(B, MATSEQAIJCUSPARSE); 46005f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 460108401ef6SPierre Jolivet PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 4602aed4548fSBarry Smith PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4603aed4548fSBarry Smith PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4604ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 4605ed502f03SStefano Zampini 
m = A->rmap->n; 4606ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 46079566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF, C)); 46089566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*C, m, n, m, n)); 46099566063dSJacob Faibussowitsch PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE)); 4610ed502f03SStefano Zampini c = (Mat_SeqAIJ *)(*C)->data; 4611ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 4612ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4613ed502f03SStefano Zampini Ccsr = new CsrMatrix; 4614ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 4615ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 4616ed502f03SStefano Zampini c->compressedrow.nrows = 0; 4617ed502f03SStefano Zampini c->compressedrow.i = NULL; 4618ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 4619ed502f03SStefano Zampini Ccusp->workVector = NULL; 4620ed502f03SStefano Zampini Ccusp->nrows = m; 4621ed502f03SStefano Zampini Ccusp->mat = Cmat; 4622ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 4623ed502f03SStefano Zampini Ccsr->num_rows = m; 4624ed502f03SStefano Zampini Ccsr->num_cols = n; 46259566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 46269566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 46279566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 46289566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 46299566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 46309566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 46319566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 46329566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 46339566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 46349566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 46359566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 463628b400f6SJacob Faibussowitsch PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 463728b400f6SJacob Faibussowitsch PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4638ed502f03SStefano Zampini 4639ed502f03SStefano Zampini Acsr = (CsrMatrix *)Acusp->mat->mat; 4640ed502f03SStefano Zampini Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4641ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 4642ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 4643ed502f03SStefano Zampini c->nz = Annz + Bnnz; 4644ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1); 4645ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4646ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 4647ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 4648ed502f03SStefano Zampini Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4649ed502f03SStefano Zampini if (c->nz) { 46502ed87e7eSStefano Zampini auto Acoo = new THRUSTINTARRAY32(Annz); 46512ed87e7eSStefano Zampini auto Bcoo = new THRUSTINTARRAY32(Bnnz); 46522ed87e7eSStefano Zampini auto Ccoo = new THRUSTINTARRAY32(c->nz); 46532ed87e7eSStefano Zampini THRUSTINTARRAY32 *Aroff, *Broff; 46542ed87e7eSStefano Zampini 4655ed502f03SStefano Zampini if (a->compressedrow.use) { /* need full row offset */ 4656ed502f03SStefano Zampini if (!Acusp->rowoffsets_gpu) { 4657ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4658ed502f03SStefano Zampini 
Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 46599566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 4660ed502f03SStefano Zampini } 46612ed87e7eSStefano Zampini Aroff = Acusp->rowoffsets_gpu; 46622ed87e7eSStefano Zampini } else Aroff = Acsr->row_offsets; 4663ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 4664ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 4665ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4666ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 46679566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 4668ed502f03SStefano Zampini } 46692ed87e7eSStefano Zampini Broff = Bcusp->rowoffsets_gpu; 46702ed87e7eSStefano Zampini } else Broff = Bcsr->row_offsets; 46719566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 46729371c9d4SSatish Balay stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 46739371c9d4SSatish Balay PetscCallCUSPARSE(stat); 46749371c9d4SSatish Balay stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 46759371c9d4SSatish Balay PetscCallCUSPARSE(stat); 46762ed87e7eSStefano Zampini /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 46772ed87e7eSStefano Zampini auto Aperm = thrust::make_constant_iterator(1); 46782ed87e7eSStefano Zampini auto Bperm = thrust::make_constant_iterator(0); 46798909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0) 4680ed502f03SStefano Zampini auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n)); 4681ed502f03SStefano Zampini auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n)); 46828909a122SStefano Zampini #else 46838909a122SStefano Zampini /* there are issues 
instantiating the merge operation using a transform iterator for the columns of B */ 46848909a122SStefano Zampini auto Bcib = Bcsr->column_indices->begin(); 46858909a122SStefano Zampini auto Bcie = Bcsr->column_indices->end(); 46868909a122SStefano Zampini thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n)); 46878909a122SStefano Zampini #endif 46882ed87e7eSStefano Zampini auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz); 46892ed87e7eSStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm)); 46902ed87e7eSStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm)); 46912ed87e7eSStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm)); 46922ed87e7eSStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm)); 46932ed87e7eSStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin())); 4694ed502f03SStefano Zampini auto p1 = Ccusp->cooPerm->begin(); 4695ed502f03SStefano Zampini auto p2 = Ccusp->cooPerm->begin(); 4696ed502f03SStefano Zampini thrust::advance(p2, Annz); 4697792fecdfSBarry Smith PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4())); 46988909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0) 46998909a122SStefano Zampini thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n)); 47008909a122SStefano Zampini #endif 47012ed87e7eSStefano Zampini auto cci = thrust::make_counting_iterator(zero); 47022ed87e7eSStefano Zampini auto cce = thrust::make_counting_iterator(c->nz); 47032ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0 4704792fecdfSBarry Smith 
PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 47052ed87e7eSStefano Zampini #else 47062ed87e7eSStefano Zampini auto pred = thrust::identity<int>(); 4707792fecdfSBarry Smith PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred)); 4708792fecdfSBarry Smith PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred)); 47092ed87e7eSStefano Zampini #endif 47109371c9d4SSatish Balay stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO); 47119371c9d4SSatish Balay PetscCallCUSPARSE(stat); 47129566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 47132ed87e7eSStefano Zampini delete wPerm; 47142ed87e7eSStefano Zampini delete Acoo; 47152ed87e7eSStefano Zampini delete Bcoo; 47162ed87e7eSStefano Zampini delete Ccoo; 4717ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 47189371c9d4SSatish Balay stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 47199371c9d4SSatish Balay PetscCallCUSPARSE(stat); 4720ed502f03SStefano Zampini #endif 47211a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 47229566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 47239566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 4724ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? 
PETSC_TRUE : PETSC_FALSE; 4725ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4726ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 4727ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4728ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4729ed502f03SStefano Zampini 47301a2c6b5cSJunchao Zhang (*C)->form_explicit_transpose = PETSC_TRUE; 47311a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4732a49f1ed0SStefano Zampini Ccusp->rowoffsets_gpu = NULL; 4733ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 4734ed502f03SStefano Zampini CmatT->mat = CcsrT; 4735ed502f03SStefano Zampini CcsrT->num_rows = n; 4736ed502f03SStefano Zampini CcsrT->num_cols = m; 4737ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 4738ed502f03SStefano Zampini 4739ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1); 4740ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4741ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 4742ed502f03SStefano Zampini 47439566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4744ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 4745ed502f03SStefano Zampini if (AT) { 4746ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT); 4747ed502f03SStefano Zampini thrust::advance(rT, -1); 4748ed502f03SStefano Zampini } 4749ed502f03SStefano Zampini if (BT) { 4750ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz)); 4751ed502f03SStefano Zampini auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz)); 4752ed502f03SStefano Zampini thrust::copy(titb, tite, rT); 4753ed502f03SStefano Zampini } 4754ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 4755ed502f03SStefano Zampini if (AT) cT 
= thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT); 4756ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT); 4757ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4758ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4759ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 47609566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4761ed502f03SStefano Zampini 47629566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 47639566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 47649566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 47659566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar))); 47669566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar))); 47679566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 47689566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 47699566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 47709566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4771ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 47729371c9d4SSatish Balay stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 
CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 47739371c9d4SSatish Balay PetscCallCUSPARSE(stat); 4774ed502f03SStefano Zampini #endif 4775ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 4776ed502f03SStefano Zampini } 4777ed502f03SStefano Zampini } 4778ed502f03SStefano Zampini 4779ed502f03SStefano Zampini c->singlemalloc = PETSC_FALSE; 4780ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 4781ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 47829566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m + 1, &c->i)); 47839566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz, &c->j)); 4784ed502f03SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4785ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4786ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4787ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 4788ed502f03SStefano Zampini jj = *Ccsr->column_indices; 47899566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 47909566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4791ed502f03SStefano Zampini } else { 47929566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 47939566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4794ed502f03SStefano Zampini } 47959566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 47969566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->ilen)); 47979566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, 
&c->imax)); 4798ed502f03SStefano Zampini c->maxnz = c->nz; 4799ed502f03SStefano Zampini c->nonzerorowcnt = 0; 4800ed502f03SStefano Zampini c->rmax = 0; 4801ed502f03SStefano Zampini for (i = 0; i < m; i++) { 4802ed502f03SStefano Zampini const PetscInt nn = c->i[i + 1] - c->i[i]; 4803ed502f03SStefano Zampini c->ilen[i] = c->imax[i] = nn; 4804ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt) !!nn; 4805ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax, nn); 4806ed502f03SStefano Zampini } 48079566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 48089566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz, &c->a)); 4809ed502f03SStefano Zampini (*C)->nonzerostate++; 48109566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->rmap)); 48119566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->cmap)); 4812ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 4813ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 4814ed502f03SStefano Zampini } else { 481508401ef6SPierre Jolivet PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n); 4816ed502f03SStefano Zampini c = (Mat_SeqAIJ *)(*C)->data; 4817ed502f03SStefano Zampini if (c->nz) { 4818ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 48195f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm"); 4820aed4548fSBarry Smith PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 482108401ef6SPierre Jolivet PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate"); 48229566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 48239566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 48245f80ce2aSJacob Faibussowitsch 
PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 48255f80ce2aSJacob Faibussowitsch PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4826ed502f03SStefano Zampini Acsr = (CsrMatrix *)Acusp->mat->mat; 4827ed502f03SStefano Zampini Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4828ed502f03SStefano Zampini Ccsr = (CsrMatrix *)Ccusp->mat->mat; 4829aed4548fSBarry Smith PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size()); 4830aed4548fSBarry Smith PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size()); 4831aed4548fSBarry Smith PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size()); 4832aed4548fSBarry Smith PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries); 48335f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size()); 4834ed502f03SStefano Zampini auto pmid = Ccusp->cooPerm->begin(); 4835ed502f03SStefano Zampini thrust::advance(pmid, Acsr->num_entries); 48369566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 48379371c9d4SSatish Balay auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin()))); 48389371c9d4SSatish Balay auto 
zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4839ed502f03SStefano Zampini thrust::for_each(zibait, zieait, VecCUDAEquals()); 48409371c9d4SSatish Balay auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 48419371c9d4SSatish Balay auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end()))); 4842ed502f03SStefano Zampini thrust::for_each(zibbit, ziebit, VecCUDAEquals()); 48439566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE)); 48441a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 48455f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4846ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4847ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4848ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? 
(CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Gather selected entries of the matrix nonzero value array into v[].

   A   - the SeqAIJCUSPARSE matrix; values are read on the device via MatSeqAIJCUSPARSEGetArrayRead()
   n   - number of entries to copy
   idx - indices into the nnz value array (host memory; uploaded below); if NULL, the first n values are copied
   v   - destination array; may be host or device memory (distinguished at runtime with isCudaMem())
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n); /* upload the index set to the device */
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v); /* gather straight into the caller's device buffer */
    } else {
      w  = new THRUSTARRAY(n); /* device staging buffer; copied back to the host below */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[i] = av[widx[i]] for i in [0,n), executed on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) {
      /* free the staging buffer before checking the copy status, so an error return cannot leak it */
      cudaError_t cerr = cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost);
      delete w;
      PetscCallCUDA(cerr);
    }
  } else {
    /* no index set: copy the leading n values directly */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory the data just moved device->host, so log a GPU->CPU transfer
     (the original logged PetscLogCpuToGpu, i.e. the wrong direction) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(0);
}