1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 7 8 #include <petscconf.h> 9 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10 #include <../src/mat/impls/sbaij/seq/sbaij.h> 11 #include <../src/vec/vec/impls/dvecimpl.h> 12 #include <petsc/private/vecimpl.h> 13 #undef VecType 14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15 #include <thrust/adjacent_difference.h> 16 #include <thrust/async/for_each.h> 17 #include <thrust/iterator/constant_iterator.h> 18 #include <thrust/remove.h> 19 #include <thrust/sort.h> 20 #include <thrust/unique.h> 21 22 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 23 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 24 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 25 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 26 27 typedef enum { 28 CUSPARSE_MV_ALG_DEFAULT = 0, 29 CUSPARSE_COOMV_ALG = 1, 30 CUSPARSE_CSRMV_ALG1 = 2, 31 CUSPARSE_CSRMV_ALG2 = 3 32 } cusparseSpMVAlg_t; 33 34 typedef enum { 35 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 36 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 37 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 38 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 39 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 40 CUSPARSE_SPMM_ALG_DEFAULT = 0, 41 CUSPARSE_SPMM_COO_ALG1 = 1, 42 CUSPARSE_SPMM_COO_ALG2 = 2, 43 CUSPARSE_SPMM_COO_ALG3 = 3, 44 CUSPARSE_SPMM_COO_ALG4 = 5, 45 CUSPARSE_SPMM_CSR_ALG1 = 4, 46 CUSPARSE_SPMM_CSR_ALG2 = 6, 47 } cusparseSpMMAlg_t; 48 49 typedef enum { 50 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 51 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 52 } cusparseCsr2CscAlg_t; 53 */ 54 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 55 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 56 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 57 #endif 58 59 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 60 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 61 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 62 63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 66 67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 81 82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 86 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 87 88 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 89 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 90 91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 92 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]); 93 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 94 95 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 96 { 97 PetscFunctionBegin; 98 *type = MATSOLVERCUSPARSE; 99 PetscFunctionReturn(0); 100 } 101 102 /*MC 103 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 104 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 105 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 106 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 107 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 108 algorithms are not recommended. This class does NOT support direct solver operations. 109 110 Level: beginner 111 112 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 113 M*/ 114 115 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 116 { 117 PetscInt n = A->rmap->n; 118 119 PetscFunctionBegin; 120 PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 121 PetscCall(MatSetSizes(*B,n,n,n,n)); 122 (*B)->factortype = ftype; 123 PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE)); 124 125 if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE)); 126 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 127 PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 128 if (!A->boundtocpu) { 129 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 130 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 131 } else { 132 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 133 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 134 } 135 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 136 PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU])); 137 PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 138 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 139 if (!A->boundtocpu) { 140 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 141 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 142 } else { 143 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 144 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 145 } 146 PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 147 PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC])); 148 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 149 150 PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 151 (*B)->canuseordering = PETSC_TRUE; 152 PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse)); 153 PetscFunctionReturn(0); 154 } 155 156 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 157 { 158 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 159 160 PetscFunctionBegin; 161 switch (op) { 162 case MAT_CUSPARSE_MULT: 163 cusparsestruct->format = format; 164 break; 165 case MAT_CUSPARSE_ALL: 166 cusparsestruct->format = format; 167 break; 168 default: 169 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 170 } 171 PetscFunctionReturn(0); 172 } 173 174 /*@ 175 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 176 operation. Only the MatMult operation can use different GPU storage formats 177 for MPIAIJCUSPARSE matrices. 178 Not Collective 179 180 Input Parameters: 181 + A - Matrix of type SEQAIJCUSPARSE 182 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 183 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 184 185 Output Parameter: 186 187 Level: intermediate 188 189 .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 190 @*/ 191 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 192 { 193 PetscFunctionBegin; 194 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 195 PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format)); 196 PetscFunctionReturn(0); 197 } 198 199 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu) 200 { 201 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 202 203 PetscFunctionBegin; 204 cusparsestruct->use_cpu_solve = use_cpu; 205 PetscFunctionReturn(0); 206 } 207 208 /*@ 209 MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve. 210 211 Input Parameters: 212 + A - Matrix of type SEQAIJCUSPARSE 213 - use_cpu - set flag for using the built-in CPU MatSolve 214 215 Output Parameter: 216 217 Notes: 218 The cuSparse LU solver currently computes the factors with the built-in CPU method 219 and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there. 220 This method to specify if the solve is done on the CPU or GPU (GPU is the default). 221 222 Level: intermediate 223 224 .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 225 @*/ 226 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu) 227 { 228 PetscFunctionBegin; 229 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 230 PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu)); 231 PetscFunctionReturn(0); 232 } 233 234 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 235 { 236 PetscFunctionBegin; 237 switch (op) { 238 case MAT_FORM_EXPLICIT_TRANSPOSE: 239 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 240 if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 241 A->form_explicit_transpose = flg; 242 break; 243 default: 244 PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 245 break; 246 } 247 PetscFunctionReturn(0); 248 } 249 250 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 251 252 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 253 { 254 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 255 IS isrow = b->row,iscol = b->col; 256 PetscBool row_identity,col_identity; 257 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr; 258 259 PetscFunctionBegin; 260 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 261 PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 262 B->offloadmask = PETSC_OFFLOAD_CPU; 263 /* determine which version of MatSolve needs to be used. */ 264 PetscCall(ISIdentity(isrow,&row_identity)); 265 PetscCall(ISIdentity(iscol,&col_identity)); 266 267 if (!cusparsestruct->use_cpu_solve) { 268 if (row_identity && col_identity) { 269 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 270 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 271 } else { 272 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 273 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 274 } 275 } 276 B->ops->matsolve = NULL; 277 B->ops->matsolvetranspose = NULL; 278 279 /* get the triangular factors */ 280 if (!cusparsestruct->use_cpu_solve) { 281 PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 282 } 283 PetscFunctionReturn(0); 284 } 285 286 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 287 { 288 MatCUSPARSEStorageFormat format; 289 PetscBool flg; 290 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 291 292 PetscFunctionBegin; 293 PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options"); 294 if (A->factortype == MAT_FACTOR_NONE) { 295 PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 296 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg)); 297 if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format)); 298 299 PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 300 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg)); 301 if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format)); 302 PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg)); 303 if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve)); 304 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 305 PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 306 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg)); 307 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 308 #if CUSPARSE_VERSION > 11301 309 PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 310 #else 311 PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 312 #endif 313 PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 314 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg)); 315 PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 316 317 PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 318 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg)); 319 PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 320 #endif 321 } 322 PetscOptionsHeadEnd(); 323 PetscFunctionReturn(0); 324 } 325 326 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 327 { 328 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 329 330 PetscFunctionBegin; 331 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 332 PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 333 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 334 PetscFunctionReturn(0); 335 } 336 337 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 338 { 339 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 340 341 PetscFunctionBegin; 342 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 343 PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 344 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 345 PetscFunctionReturn(0); 346 } 347 348 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 349 { 350 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 351 352 PetscFunctionBegin; 353 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 354 PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info)); 355 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 356 PetscFunctionReturn(0); 357 } 358 359 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 360 { 361 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 362 363 PetscFunctionBegin; 364 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 365 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info)); 366 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 367 PetscFunctionReturn(0); 368 } 369 370 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 371 { 372 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 373 PetscInt n = A->rmap->n; 374 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 375 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 376 const PetscInt *ai = a->i,*aj = a->j,*vi; 377 const MatScalar *aa = a->a,*v; 378 PetscInt *AiLo, *AjLo; 379 PetscInt i,nz, nzLower, offset, rowOffset; 380 381 PetscFunctionBegin; 382 if (!n) PetscFunctionReturn(0); 383 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 384 try { 385 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 386 nzLower=n+ai[n]-ai[1]; 387 if (!loTriFactor) { 388 PetscScalar *AALo; 389 390 PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar))); 391 392 /* Allocate Space for the lower triangular matrix */ 393 PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt))); 394 PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt))); 395 396 /* Fill the lower triangular matrix */ 397 AiLo[0] = (PetscInt) 0; 398 AiLo[n] = nzLower; 399 AjLo[0] = (PetscInt) 0; 400 AALo[0] = (MatScalar) 1.0; 401 v = aa; 402 vi = aj; 403 offset = 1; 404 rowOffset= 1; 405 for (i=1; i<n; i++) { 406 nz = ai[i+1] - ai[i]; 407 /* additional 1 for the term on the diagonal */ 408 AiLo[i] = rowOffset; 409 rowOffset += nz+1; 410 411 PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz)); 412 PetscCall(PetscArraycpy(&(AALo[offset]), v, nz)); 413 414 offset += nz; 415 AjLo[offset] = (PetscInt) i; 416 AALo[offset] = (MatScalar) 1.0; 417 offset += 1; 418 419 v += nz; 420 vi += nz; 421 } 422 423 /* allocate space for the triangular factor information */ 424 PetscCall(PetscNew(&loTriFactor)); 425 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 426 /* Create the matrix description */ 427 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 428 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 429 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 430 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 431 #else 432 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 433 #endif 434 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER)); 435 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 436 437 /* set the operation */ 438 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 439 440 /* set the matrix */ 441 loTriFactor->csrMat = new CsrMatrix; 442 loTriFactor->csrMat->num_rows = n; 443 loTriFactor->csrMat->num_cols = n; 444 loTriFactor->csrMat->num_entries = nzLower; 445 446 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 447 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 448 449 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 450 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 451 452 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 453 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 454 455 /* Create the solve analysis information */ 456 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 457 PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo)); 458 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 459 PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 460 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 461 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 462 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 463 &loTriFactor->solveBufferSize)); 464 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 465 #endif 466 467 /* perform the solve analysis */ 468 PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 469 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 470 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 471 loTriFactor->csrMat->column_indices->data().get(), 472 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 473 loTriFactor->solveInfo, 474 loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 475 #else 476 loTriFactor->solveInfo)); 477 #endif 478 PetscCallCUDA(WaitForCUDA()); 479 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 480 481 /* assign the pointer */ 482 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 483 loTriFactor->AA_h = AALo; 484 PetscCallCUDA(cudaFreeHost(AiLo)); 485 PetscCallCUDA(cudaFreeHost(AjLo)); 486 PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar))); 487 } else { /* update values only */ 488 if (!loTriFactor->AA_h) { 489 PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar))); 490 } 491 /* Fill the lower triangular matrix */ 492 loTriFactor->AA_h[0] = 1.0; 493 v = aa; 494 vi = aj; 495 offset = 1; 496 for (i=1; i<n; i++) { 497 nz = ai[i+1] - ai[i]; 498 PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz)); 499 offset += nz; 500 loTriFactor->AA_h[offset] = 1.0; 501 offset += 1; 502 v += nz; 503 } 504 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 505 PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar))); 506 } 507 } catch(char *ex) { 508 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 509 } 510 } 511 PetscFunctionReturn(0); 512 } 513 514 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 515 { 516 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 517 PetscInt n = A->rmap->n; 518 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 519 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 520 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 521 const MatScalar *aa = a->a,*v; 522 PetscInt *AiUp, *AjUp; 523 PetscInt i,nz, nzUpper, offset; 524 525 PetscFunctionBegin; 526 if (!n) PetscFunctionReturn(0); 527 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 528 try { 529 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 530 nzUpper = adiag[0]-adiag[n]; 531 if (!upTriFactor) { 532 PetscScalar *AAUp; 533 534 PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 535 536 /* Allocate Space for the upper triangular matrix */ 537 PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 538 PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 539 540 /* Fill the upper triangular matrix */ 541 AiUp[0]=(PetscInt) 0; 542 AiUp[n]=nzUpper; 543 offset = nzUpper; 544 for (i=n-1; i>=0; i--) { 545 v = aa + adiag[i+1] + 1; 546 vi = aj + adiag[i+1] + 1; 547 548 /* number of elements NOT on the diagonal */ 549 nz = adiag[i] - adiag[i+1]-1; 550 551 /* decrement the offset */ 552 offset -= (nz+1); 553 554 /* first, set the diagonal elements */ 555 AjUp[offset] = (PetscInt) i; 556 AAUp[offset] = (MatScalar)1./v[nz]; 557 AiUp[i] = AiUp[i+1] - (nz+1); 558 559 PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz)); 560 PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz)); 561 } 562 563 /* allocate space for the triangular factor information */ 564 PetscCall(PetscNew(&upTriFactor)); 565 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 566 567 /* Create the matrix description */ 568 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 569 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 570 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 571 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 572 #else 573 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 574 #endif 575 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 576 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 577 578 /* set the operation */ 579 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 580 581 /* set the matrix */ 582 upTriFactor->csrMat = new CsrMatrix; 583 upTriFactor->csrMat->num_rows = n; 584 upTriFactor->csrMat->num_cols = n; 585 upTriFactor->csrMat->num_entries = nzUpper; 586 587 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 588 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 589 590 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 591 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 592 593 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 594 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 595 596 /* Create the solve analysis information */ 597 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 598 PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo)); 599 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 600 PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 601 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 602 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 603 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 604 &upTriFactor->solveBufferSize)); 605 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 606 #endif 607 608 /* perform the solve analysis */ 609 PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 610 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 611 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 612 upTriFactor->csrMat->column_indices->data().get(), 613 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 614 upTriFactor->solveInfo, 615 upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 616 #else 617 upTriFactor->solveInfo)); 618 #endif 619 PetscCallCUDA(WaitForCUDA()); 620 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 621 622 /* assign the pointer */ 623 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 624 upTriFactor->AA_h = AAUp; 625 PetscCallCUDA(cudaFreeHost(AiUp)); 626 PetscCallCUDA(cudaFreeHost(AjUp)); 627 PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar))); 628 } else { 629 if (!upTriFactor->AA_h) { 630 PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar))); 631 } 632 /* Fill the upper triangular matrix */ 633 offset = nzUpper; 634 for (i=n-1; i>=0; i--) { 635 v = aa + adiag[i+1] + 1; 636 637 /* number of elements NOT on the diagonal */ 638 nz = adiag[i] - adiag[i+1]-1; 639 640 /* decrement the offset */ 641 offset -= (nz+1); 642 643 /* first, set the diagonal elements */ 644 upTriFactor->AA_h[offset] = 1./v[nz]; 645 PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz)); 646 } 647 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 648 PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar))); 649 } 650 } catch(char *ex) { 651 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 652 } 653 } 654 PetscFunctionReturn(0); 655 } 656 657 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 658 { 659 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 660 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 661 IS isrow = a->row,iscol = a->icol; 662 PetscBool row_identity,col_identity; 663 PetscInt n = A->rmap->n; 664 665 PetscFunctionBegin; 666 PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 667 PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A)); 668 PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A)); 669 670 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 671 cusparseTriFactors->nnz=a->nz; 672 673 A->offloadmask = PETSC_OFFLOAD_BOTH; 674 /* lower triangular indices */ 675 PetscCall(ISIdentity(isrow,&row_identity)); 676 if (!row_identity && !cusparseTriFactors->rpermIndices) { 677 const PetscInt *r; 678 679 PetscCall(ISGetIndices(isrow,&r)); 680 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 681 cusparseTriFactors->rpermIndices->assign(r, r+n); 682 PetscCall(ISRestoreIndices(isrow,&r)); 683 PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 684 } 685 686 /* upper triangular indices */ 687 PetscCall(ISIdentity(iscol,&col_identity)); 688 if (!col_identity && !cusparseTriFactors->cpermIndices) { 689 const PetscInt *c; 690 691 PetscCall(ISGetIndices(iscol,&c)); 692 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 693 cusparseTriFactors->cpermIndices->assign(c, c+n); 694 PetscCall(ISRestoreIndices(iscol,&c)); 695 PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 696 } 697 PetscFunctionReturn(0); 698 } 699 700 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 701 { 702 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 703 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 704 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 705 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 706 PetscInt *AiUp, *AjUp; 707 PetscScalar *AAUp; 708 PetscScalar *AALo; 709 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 710 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 711 const PetscInt *ai = b->i,*aj = b->j,*vj; 712 const MatScalar *aa = b->a,*v; 713 714 PetscFunctionBegin; 715 if (!n) PetscFunctionReturn(0); 716 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 717 try { 718 PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 719 PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar))); 720 if (!upTriFactor && !loTriFactor) { 721 /* Allocate Space for the upper triangular matrix */ 722 PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 723 PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 724 725 /* Fill the upper triangular matrix */ 726 AiUp[0]=(PetscInt) 0; 727 AiUp[n]=nzUpper; 728 offset = 0; 729 for (i=0; i<n; i++) { 730 /* set the pointers */ 731 v = aa + ai[i]; 732 vj = aj + ai[i]; 733 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 734 735 /* first, set the diagonal elements */ 736 AjUp[offset] = (PetscInt) i; 737 AAUp[offset] = (MatScalar)1.0/v[nz]; 738 AiUp[i] = offset; 739 AALo[offset] = (MatScalar)1.0/v[nz]; 740 741 offset+=1; 742 if (nz>0) { 743 PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz)); 744 PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 745 for (j=offset; j<offset+nz; j++) { 746 AAUp[j] = -AAUp[j]; 747 AALo[j] = AAUp[j]/v[nz]; 748 } 749 offset+=nz; 750 } 751 } 752 753 /* allocate space for the triangular factor information */ 754 PetscCall(PetscNew(&upTriFactor)); 755 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 756 757 /* Create the matrix description */ 758 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 759 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 760 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 761 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 762 #else 763 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 764 #endif 765 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 766 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 767 768 /* set the matrix */ 769 upTriFactor->csrMat = new CsrMatrix; 770 upTriFactor->csrMat->num_rows = A->rmap->n; 771 upTriFactor->csrMat->num_cols = A->cmap->n; 772 upTriFactor->csrMat->num_entries = a->nz; 773 774 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 775 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 776 777 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 778 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 779 780 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 781 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 782 783 /* set the operation */ 784 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 785 786 /* Create the solve analysis information */ 787 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 788 PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo)); 789 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 790 PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 791 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 792 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 793 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 794 &upTriFactor->solveBufferSize)); 795 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 796 #endif 797 798 /* perform the solve analysis */ 799 PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 800 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 801 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 802 upTriFactor->csrMat->column_indices->data().get(), 803 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 804 upTriFactor->solveInfo, 805 upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 806 #else 807 upTriFactor->solveInfo)); 808 #endif 809 PetscCallCUDA(WaitForCUDA()); 810 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 811 812 /* assign the pointer */ 813 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 814 815 /* allocate space for the triangular factor information */ 816 PetscCall(PetscNew(&loTriFactor)); 817 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 818 819 /* Create the matrix description */ 820 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 821 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 822 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 823 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 824 #else 825 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 826 #endif 827 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 828 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 829 830 /* set the operation */ 831 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 832 833 /* set the matrix */ 834 loTriFactor->csrMat = new CsrMatrix; 835 loTriFactor->csrMat->num_rows = A->rmap->n; 836 loTriFactor->csrMat->num_cols = A->cmap->n; 837 loTriFactor->csrMat->num_entries = a->nz; 838 839 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 840 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 841 842 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 843 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 844 845 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 846 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 847 848 /* Create the solve analysis information */ 849 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 850 PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo)); 851 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 852 PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 853 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 854 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 855 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 856 &loTriFactor->solveBufferSize)); 857 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 858 #endif 859 860 /* perform the solve analysis */ 861 PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 862 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 863 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 864 loTriFactor->csrMat->column_indices->data().get(), 865 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 866 loTriFactor->solveInfo, 867 loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 868 #else 869 loTriFactor->solveInfo)); 870 #endif 871 PetscCallCUDA(WaitForCUDA()); 872 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 873 874 /* assign the pointer */ 875 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 876 877 PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))); 878 PetscCallCUDA(cudaFreeHost(AiUp)); 879 PetscCallCUDA(cudaFreeHost(AjUp)); 880 } else { 881 /* Fill the upper triangular matrix */ 882 offset = 0; 883 for (i=0; i<n; i++) { 884 /* set the pointers */ 885 v = aa + ai[i]; 886 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 887 888 /* first, set the diagonal elements */ 889 AAUp[offset] = 1.0/v[nz]; 890 AALo[offset] = 1.0/v[nz]; 891 892 offset+=1; 893 if (nz>0) { 894 PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 895 for (j=offset; j<offset+nz; j++) { 896 AAUp[j] = -AAUp[j]; 897 AALo[j] = AAUp[j]/v[nz]; 898 } 899 offset+=nz; 900 } 901 } 902 PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 903 PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 904 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 905 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 906 PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar))); 907 } 908 PetscCallCUDA(cudaFreeHost(AAUp)); 909 PetscCallCUDA(cudaFreeHost(AALo)); 910 } catch(char *ex) { 911 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 912 } 913 } 914 PetscFunctionReturn(0); 915 } 916 917 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 918 { 919 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 920 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 921 IS ip = a->row; 922 PetscBool perm_identity; 923 PetscInt n = A->rmap->n; 924 925 PetscFunctionBegin; 926 PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 927 PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 928 if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 929 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 930 931 A->offloadmask = PETSC_OFFLOAD_BOTH; 932 933 /* lower triangular indices */ 934 PetscCall(ISIdentity(ip,&perm_identity)); 935 if (!perm_identity) { 936 IS iip; 937 const PetscInt *irip,*rip; 938 939 PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip)); 940 PetscCall(ISGetIndices(iip,&irip)); 941 PetscCall(ISGetIndices(ip,&rip)); 942 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 943 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 944 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 945 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 946 PetscCall(ISRestoreIndices(iip,&irip)); 947 PetscCall(ISDestroy(&iip)); 948 PetscCall(ISRestoreIndices(ip,&rip)); 949 PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 950 } 951 PetscFunctionReturn(0); 952 } 953 954 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 955 { 956 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 957 IS ip = b->row; 958 PetscBool perm_identity; 959 960 PetscFunctionBegin; 961 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 962 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info)); 963 B->offloadmask = PETSC_OFFLOAD_CPU; 964 /* determine which version of MatSolve needs to be used. */ 965 PetscCall(ISIdentity(ip,&perm_identity)); 966 if (perm_identity) { 967 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 968 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 969 B->ops->matsolve = NULL; 970 B->ops->matsolvetranspose = NULL; 971 } else { 972 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 973 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 974 B->ops->matsolve = NULL; 975 B->ops->matsolvetranspose = NULL; 976 } 977 978 /* get the triangular factors */ 979 PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 980 PetscFunctionReturn(0); 981 } 982 983 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 984 { 985 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 986 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 987 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 988 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 989 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 990 cusparseIndexBase_t indexBase; 991 cusparseMatrixType_t matrixType; 992 cusparseFillMode_t fillMode; 993 cusparseDiagType_t diagType; 994 995 PetscFunctionBegin; 996 /* allocate space for the transpose of the lower triangular factor */ 997 PetscCall(PetscNew(&loTriFactorT)); 998 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 999 1000 /* set the matrix descriptors of the lower triangular factor */ 1001 matrixType = cusparseGetMatType(loTriFactor->descr); 1002 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1003 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1004 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1005 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1006 1007 /* Create the matrix description */ 1008 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 1009 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 1010 PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 1011 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 1012 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 1013 1014 /* set the operation */ 1015 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1016 1017 /* allocate GPU space for the CSC of the lower triangular factor*/ 1018 loTriFactorT->csrMat = new CsrMatrix; 1019 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1020 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1021 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1022 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1023 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1024 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1025 1026 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1027 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1028 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1029 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1030 loTriFactor->csrMat->values->data().get(), 1031 loTriFactor->csrMat->row_offsets->data().get(), 1032 loTriFactor->csrMat->column_indices->data().get(), 1033 loTriFactorT->csrMat->values->data().get(), 1034 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1035 CUSPARSE_ACTION_NUMERIC,indexBase, 1036 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 1037 PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize)); 1038 #endif 1039 1040 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1041 PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1042 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1043 loTriFactor->csrMat->values->data().get(), 1044 loTriFactor->csrMat->row_offsets->data().get(), 1045 loTriFactor->csrMat->column_indices->data().get(), 1046 loTriFactorT->csrMat->values->data().get(), 1047 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1048 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1049 CUSPARSE_ACTION_NUMERIC, indexBase, 1050 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer)); 1051 #else 1052 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1053 CUSPARSE_ACTION_NUMERIC, indexBase)); 1054 #endif 1055 PetscCallCUDA(WaitForCUDA()); 1056 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1057 1058 /* Create the solve analysis information */ 1059 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1060 PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactorT->solveInfo)); 1061 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1062 PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1063 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1064 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1065 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1066 &loTriFactorT->solveBufferSize)); 1067 PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize)); 1068 #endif 1069 1070 /* perform the solve analysis */ 1071 PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1072 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1073 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1074 loTriFactorT->csrMat->column_indices->data().get(), 1075 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1076 loTriFactorT->solveInfo, 1077 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1078 #else 1079 loTriFactorT->solveInfo)); 1080 #endif 1081 PetscCallCUDA(WaitForCUDA()); 1082 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1083 1084 /* assign the pointer */ 1085 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1086 1087 /*********************************************/ 1088 /* Now the Transpose of the Upper Tri Factor */ 1089 /*********************************************/ 1090 1091 /* allocate space for the transpose of the upper triangular factor */ 1092 PetscCall(PetscNew(&upTriFactorT)); 1093 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1094 1095 /* set the matrix descriptors of the upper triangular factor */ 1096 matrixType = cusparseGetMatType(upTriFactor->descr); 1097 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1098 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1099 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1100 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1101 1102 /* Create the matrix description */ 1103 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 1104 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 1105 PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 1106 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 1107 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1108 1109 /* set the operation */ 1110 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1111 1112 /* allocate GPU space for the CSC of the upper triangular factor*/ 1113 upTriFactorT->csrMat = new CsrMatrix; 1114 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1115 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1116 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1117 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1118 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1119 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1120 1121 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1122 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1123 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1124 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1125 upTriFactor->csrMat->values->data().get(), 1126 upTriFactor->csrMat->row_offsets->data().get(), 1127 upTriFactor->csrMat->column_indices->data().get(), 1128 upTriFactorT->csrMat->values->data().get(), 1129 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1130 CUSPARSE_ACTION_NUMERIC,indexBase, 1131 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 1132 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize)); 1133 #endif 1134 1135 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1136 PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1137 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1138 upTriFactor->csrMat->values->data().get(), 1139 upTriFactor->csrMat->row_offsets->data().get(), 1140 upTriFactor->csrMat->column_indices->data().get(), 1141 upTriFactorT->csrMat->values->data().get(), 1142 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1143 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1144 CUSPARSE_ACTION_NUMERIC, indexBase, 1145 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer)); 1146 #else 1147 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1148 CUSPARSE_ACTION_NUMERIC, indexBase)); 1149 #endif 1150 1151 PetscCallCUDA(WaitForCUDA()); 1152 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1153 1154 /* Create the solve analysis information */ 1155 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1156 PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactorT->solveInfo)); 1157 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1158 PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1159 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1160 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1161 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1162 &upTriFactorT->solveBufferSize)); 1163 PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize)); 1164 #endif 1165 1166 /* perform the solve analysis */ 1167 /* christ, would it have killed you to put this stuff in a function????????? */ 1168 PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1169 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1170 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1171 upTriFactorT->csrMat->column_indices->data().get(), 1172 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1173 upTriFactorT->solveInfo, 1174 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1175 #else 1176 upTriFactorT->solveInfo)); 1177 #endif 1178 1179 PetscCallCUDA(WaitForCUDA()); 1180 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1181 1182 /* assign the pointer */ 1183 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1184 PetscFunctionReturn(0); 1185 } 1186 1187 struct PetscScalarToPetscInt 1188 { 1189 __host__ __device__ 1190 PetscInt operator()(PetscScalar s) 1191 { 1192 return (PetscInt)PetscRealPart(s); 1193 } 1194 }; 1195 1196 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1197 { 1198 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1199 Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1200 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1201 cusparseStatus_t stat; 1202 cusparseIndexBase_t indexBase; 1203 1204 PetscFunctionBegin; 1205 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1206 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1207 PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1208 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1209 PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1210 if (A->transupdated) PetscFunctionReturn(0); 1211 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1212 PetscCall(PetscLogGpuTimeBegin()); 1213 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1214 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 1215 } 1216 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1217 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1218 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1219 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1220 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 1221 PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1222 1223 /* set alpha and beta */ 1224 PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar))); 1225 PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar))); 1226 PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 1227 PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1228 PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1229 PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1230 1231 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1232 CsrMatrix *matrixT = new CsrMatrix; 1233 matstructT->mat = matrixT; 1234 matrixT->num_rows = A->cmap->n; 1235 matrixT->num_cols = A->rmap->n; 1236 matrixT->num_entries = a->nz; 1237 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1238 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1239 matrixT->values = new THRUSTARRAY(a->nz); 1240 1241 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1242 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1243 1244 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1245 #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1246 stat = cusparseCreateCsr(&matstructT->matDescr, 1247 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1248 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1249 matrixT->values->data().get(), 1250 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1251 indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 1252 #else 1253 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1254 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1255 1256 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1257 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1258 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 1259 */ 1260 if (matrixT->num_entries) { 1261 stat = cusparseCreateCsr(&matstructT->matDescr, 1262 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1263 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1264 matrixT->values->data().get(), 1265 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 1266 indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 1267 1268 } else { 1269 matstructT->matDescr = NULL; 1270 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1271 } 1272 #endif 1273 #endif 1274 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1275 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1276 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1277 #else 1278 CsrMatrix *temp = new CsrMatrix; 1279 CsrMatrix *tempT = new CsrMatrix; 1280 /* First convert HYB to CSR */ 1281 temp->num_rows = A->rmap->n; 1282 temp->num_cols = A->cmap->n; 1283 temp->num_entries = a->nz; 1284 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1285 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1286 temp->values = new THRUSTARRAY(a->nz); 1287 1288 stat = cusparse_hyb2csr(cusparsestruct->handle, 1289 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1290 temp->values->data().get(), 1291 temp->row_offsets->data().get(), 1292 temp->column_indices->data().get());PetscCallCUSPARSE(stat); 1293 1294 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1295 tempT->num_rows = A->rmap->n; 1296 tempT->num_cols = A->cmap->n; 1297 tempT->num_entries = a->nz; 1298 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1299 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1300 tempT->values = new THRUSTARRAY(a->nz); 1301 1302 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1303 temp->num_cols, temp->num_entries, 1304 temp->values->data().get(), 1305 temp->row_offsets->data().get(), 1306 temp->column_indices->data().get(), 1307 tempT->values->data().get(), 1308 tempT->column_indices->data().get(), 1309 tempT->row_offsets->data().get(), 1310 CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1311 1312 /* Last, convert CSC to HYB */ 1313 cusparseHybMat_t hybMat; 1314 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1315 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1316 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1317 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1318 matstructT->descr, tempT->values->data().get(), 1319 tempT->row_offsets->data().get(), 1320 tempT->column_indices->data().get(), 1321 hybMat, 0, partition);PetscCallCUSPARSE(stat); 1322 1323 /* assign the pointer */ 1324 matstructT->mat = hybMat; 1325 A->transupdated = PETSC_TRUE; 1326 /* delete temporaries */ 1327 if (tempT) { 1328 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1329 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1330 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1331 delete (CsrMatrix*) tempT; 1332 } 1333 if (temp) { 1334 if (temp->values) delete (THRUSTARRAY*) temp->values; 1335 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1336 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1337 delete (CsrMatrix*) temp; 1338 } 1339 #endif 1340 } 1341 } 1342 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1343 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1344 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1345 PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1346 PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1347 PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1348 PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1349 PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1350 PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1351 PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1352 PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1353 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1354 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1355 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1356 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 1357 } 1358 if (!cusparsestruct->csr2csc_i) { 1359 THRUSTARRAY csr2csc_a(matrix->num_entries); 1360 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1361 1362 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1363 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1364 void *csr2cscBuffer; 1365 size_t csr2cscBufferSize; 1366 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1367 A->cmap->n, matrix->num_entries, 1368 matrix->values->data().get(), 1369 cusparsestruct->rowoffsets_gpu->data().get(), 1370 matrix->column_indices->data().get(), 1371 matrixT->values->data().get(), 1372 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1373 CUSPARSE_ACTION_NUMERIC,indexBase, 1374 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat); 1375 PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize)); 1376 #endif 1377 1378 if (matrix->num_entries) { 1379 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1380 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1381 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1382 1383 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1384 should be filled with indexBase. So I just take a shortcut here. 1385 */ 1386 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1387 A->cmap->n,matrix->num_entries, 1388 csr2csc_a.data().get(), 1389 cusparsestruct->rowoffsets_gpu->data().get(), 1390 matrix->column_indices->data().get(), 1391 matrixT->values->data().get(), 1392 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1393 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1394 CUSPARSE_ACTION_NUMERIC,indexBase, 1395 cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat); 1396 #else 1397 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1398 CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1399 #endif 1400 } else { 1401 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1402 } 1403 1404 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1405 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1406 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1407 PetscCallCUDA(cudaFree(csr2cscBuffer)); 1408 #endif 1409 } 1410 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1411 thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1412 matrixT->values->begin())); 1413 } 1414 PetscCall(PetscLogGpuTimeEnd()); 1415 PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1416 /* the compressed row indices is not used for matTranspose */ 1417 matstructT->cprowIndices = NULL; 1418 /* assign the pointer */ 1419 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1420 A->transupdated = PETSC_TRUE; 1421 PetscFunctionReturn(0); 1422 } 1423 1424 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1425 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1426 { 1427 PetscInt n = xx->map->n; 1428 const PetscScalar *barray; 1429 PetscScalar *xarray; 1430 thrust::device_ptr<const PetscScalar> bGPU; 1431 thrust::device_ptr<PetscScalar> xGPU; 1432 cusparseStatus_t stat; 1433 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1434 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1435 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1436 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1437 1438 PetscFunctionBegin; 1439 /* Analyze the matrix and create the transpose ... on the fly */ 1440 if (!loTriFactorT && !upTriFactorT) { 1441 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1442 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1443 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1444 } 1445 1446 /* Get the GPU pointers */ 1447 PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 1448 PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1449 xGPU = thrust::device_pointer_cast(xarray); 1450 bGPU = thrust::device_pointer_cast(barray); 1451 1452 PetscCall(PetscLogGpuTimeBegin()); 1453 /* First, reorder with the row permutation */ 1454 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1455 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1456 xGPU); 1457 1458 /* First, solve U */ 1459 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1460 upTriFactorT->csrMat->num_rows, 1461 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1462 upTriFactorT->csrMat->num_entries, 1463 #endif 1464 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1465 upTriFactorT->csrMat->values->data().get(), 1466 upTriFactorT->csrMat->row_offsets->data().get(), 1467 upTriFactorT->csrMat->column_indices->data().get(), 1468 upTriFactorT->solveInfo, 1469 xarray, 1470 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1471 tempGPU->data().get(), 1472 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1473 #else 1474 tempGPU->data().get());PetscCallCUSPARSE(stat); 1475 #endif 1476 1477 /* Then, solve L */ 1478 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1479 loTriFactorT->csrMat->num_rows, 1480 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1481 loTriFactorT->csrMat->num_entries, 1482 #endif 1483 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1484 loTriFactorT->csrMat->values->data().get(), 1485 loTriFactorT->csrMat->row_offsets->data().get(), 1486 loTriFactorT->csrMat->column_indices->data().get(), 1487 loTriFactorT->solveInfo, 1488 tempGPU->data().get(), 1489 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1490 xarray, 1491 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1492 #else 1493 xarray);PetscCallCUSPARSE(stat); 1494 #endif 1495 1496 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1497 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1498 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1499 tempGPU->begin()); 1500 1501 /* Copy the temporary to the full solution. */ 1502 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1503 1504 /* restore */ 1505 PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 1506 PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 1507 PetscCall(PetscLogGpuTimeEnd()); 1508 PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1509 PetscFunctionReturn(0); 1510 } 1511 1512 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1513 { 1514 const PetscScalar *barray; 1515 PetscScalar *xarray; 1516 cusparseStatus_t stat; 1517 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1518 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1519 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1520 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1521 1522 PetscFunctionBegin; 1523 /* Analyze the matrix and create the transpose ... on the fly */ 1524 if (!loTriFactorT && !upTriFactorT) { 1525 PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1526 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1527 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1528 } 1529 1530 /* Get the GPU pointers */ 1531 PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 1532 PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1533 1534 PetscCall(PetscLogGpuTimeBegin()); 1535 /* First, solve U */ 1536 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1537 upTriFactorT->csrMat->num_rows, 1538 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1539 upTriFactorT->csrMat->num_entries, 1540 #endif 1541 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1542 upTriFactorT->csrMat->values->data().get(), 1543 upTriFactorT->csrMat->row_offsets->data().get(), 1544 upTriFactorT->csrMat->column_indices->data().get(), 1545 upTriFactorT->solveInfo, 1546 barray, 1547 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1548 tempGPU->data().get(), 1549 upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1550 #else 1551 tempGPU->data().get());PetscCallCUSPARSE(stat); 1552 #endif 1553 1554 /* Then, solve L */ 1555 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1556 loTriFactorT->csrMat->num_rows, 1557 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1558 loTriFactorT->csrMat->num_entries, 1559 #endif 1560 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1561 loTriFactorT->csrMat->values->data().get(), 1562 loTriFactorT->csrMat->row_offsets->data().get(), 1563 loTriFactorT->csrMat->column_indices->data().get(), 1564 loTriFactorT->solveInfo, 1565 tempGPU->data().get(), 1566 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1567 xarray, 1568 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1569 #else 1570 xarray);PetscCallCUSPARSE(stat); 1571 #endif 1572 1573 /* restore */ 1574 PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 1575 PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 1576 PetscCall(PetscLogGpuTimeEnd()); 1577 PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1578 PetscFunctionReturn(0); 1579 } 1580 1581 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1582 { 1583 const PetscScalar *barray; 1584 PetscScalar *xarray; 1585 thrust::device_ptr<const PetscScalar> bGPU; 1586 thrust::device_ptr<PetscScalar> xGPU; 1587 cusparseStatus_t stat; 1588 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1589 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1590 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1591 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1592 1593 PetscFunctionBegin; 1594 1595 /* Get the GPU pointers */ 1596 PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 1597 PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1598 xGPU = thrust::device_pointer_cast(xarray); 1599 bGPU = thrust::device_pointer_cast(barray); 1600 1601 PetscCall(PetscLogGpuTimeBegin()); 1602 /* First, reorder with the row permutation */ 1603 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1604 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1605 tempGPU->begin()); 1606 1607 /* Next, solve L */ 1608 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1609 loTriFactor->csrMat->num_rows, 1610 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1611 loTriFactor->csrMat->num_entries, 1612 #endif 1613 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1614 loTriFactor->csrMat->values->data().get(), 1615 loTriFactor->csrMat->row_offsets->data().get(), 1616 loTriFactor->csrMat->column_indices->data().get(), 1617 loTriFactor->solveInfo, 1618 tempGPU->data().get(), 1619 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1620 xarray, 1621 loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1622 #else 1623 xarray);PetscCallCUSPARSE(stat); 1624 #endif 1625 1626 /* Then, solve U */ 1627 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1628 upTriFactor->csrMat->num_rows, 1629 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1630 upTriFactor->csrMat->num_entries, 1631 #endif 1632 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1633 upTriFactor->csrMat->values->data().get(), 1634 upTriFactor->csrMat->row_offsets->data().get(), 1635 upTriFactor->csrMat->column_indices->data().get(), 1636 upTriFactor->solveInfo,xarray, 1637 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1638 tempGPU->data().get(), 1639 upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1640 #else 1641 tempGPU->data().get());PetscCallCUSPARSE(stat); 1642 #endif 1643 1644 /* Last, reorder with the column permutation */ 1645 thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1646 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1647 xGPU); 1648 1649 PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 1650 PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 1651 PetscCall(PetscLogGpuTimeEnd()); 1652 PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1653 PetscFunctionReturn(0); 1654 } 1655 1656 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1657 { 1658 const PetscScalar *barray; 1659 PetscScalar *xarray; 1660 cusparseStatus_t stat; 1661 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1662 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1663 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1664 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1665 1666 PetscFunctionBegin; 1667 /* Get the GPU pointers */ 1668 PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 1669 PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1670 1671 PetscCall(PetscLogGpuTimeBegin()); 1672 /* First, solve L */ 1673 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1674 loTriFactor->csrMat->num_rows, 1675 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1676 loTriFactor->csrMat->num_entries, 1677 #endif 1678 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1679 loTriFactor->csrMat->values->data().get(), 1680 loTriFactor->csrMat->row_offsets->data().get(), 1681 loTriFactor->csrMat->column_indices->data().get(), 1682 loTriFactor->solveInfo, 1683 barray, 1684 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1685 tempGPU->data().get(), 1686 loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1687 #else 1688 tempGPU->data().get());PetscCallCUSPARSE(stat); 1689 #endif 1690 1691 /* Next, solve U */ 1692 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1693 upTriFactor->csrMat->num_rows, 1694 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1695 upTriFactor->csrMat->num_entries, 1696 #endif 1697 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1698 upTriFactor->csrMat->values->data().get(), 1699 upTriFactor->csrMat->row_offsets->data().get(), 1700 upTriFactor->csrMat->column_indices->data().get(), 1701 upTriFactor->solveInfo, 1702 tempGPU->data().get(), 1703 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1704 xarray, 1705 upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1706 #else 1707 xarray);PetscCallCUSPARSE(stat); 1708 #endif 1709 1710 PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 1711 PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 1712 PetscCall(PetscLogGpuTimeEnd()); 1713 PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1714 PetscFunctionReturn(0); 1715 } 1716 1717 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1718 { 1719 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1720 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1721 1722 PetscFunctionBegin; 1723 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1724 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1725 1726 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 1727 PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 1728 PetscCallCUDA(WaitForCUDA()); 1729 PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar))); 1730 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 1731 A->offloadmask = PETSC_OFFLOAD_BOTH; 1732 } 1733 PetscFunctionReturn(0); 1734 } 1735 1736 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1737 { 1738 PetscFunctionBegin; 1739 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 1740 *array = ((Mat_SeqAIJ*)A->data)->a; 1741 PetscFunctionReturn(0); 1742 } 1743 1744 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1745 { 1746 PetscFunctionBegin; 1747 A->offloadmask = PETSC_OFFLOAD_CPU; 1748 *array = NULL; 1749 PetscFunctionReturn(0); 1750 } 1751 1752 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 1753 { 1754 PetscFunctionBegin; 1755 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 1756 *array = ((Mat_SeqAIJ*)A->data)->a; 1757 PetscFunctionReturn(0); 1758 } 1759 1760 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 1761 { 1762 PetscFunctionBegin; 1763 *array = NULL; 1764 PetscFunctionReturn(0); 1765 } 1766 1767 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1768 { 1769 PetscFunctionBegin; 1770 *array = ((Mat_SeqAIJ*)A->data)->a; 1771 PetscFunctionReturn(0); 1772 } 1773 1774 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1775 { 1776 PetscFunctionBegin; 1777 A->offloadmask = PETSC_OFFLOAD_CPU; 1778 *array = NULL; 1779 PetscFunctionReturn(0); 1780 } 1781 1782 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype) 1783 { 1784 Mat_SeqAIJCUSPARSE *cusp; 1785 CsrMatrix *matrix; 1786 1787 PetscFunctionBegin; 1788 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1789 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 1790 cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr); 1791 PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL"); 1792 matrix = (CsrMatrix*)cusp->mat->mat; 1793 1794 if (i) { 1795 #if !defined(PETSC_USE_64BIT_INDICES) 1796 *i = matrix->row_offsets->data().get(); 1797 #else 1798 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices"); 1799 #endif 1800 } 1801 if (j) { 1802 #if !defined(PETSC_USE_64BIT_INDICES) 1803 *j = matrix->column_indices->data().get(); 1804 #else 1805 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices"); 1806 #endif 1807 } 1808 if (a) *a = matrix->values->data().get(); 1809 if (mtype) *mtype = PETSC_MEMTYPE_CUDA; 1810 PetscFunctionReturn(0); 1811 } 1812 1813 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1814 { 1815 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1816 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1817 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1818 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1819 cusparseStatus_t stat; 1820 PetscBool both = PETSC_TRUE; 1821 1822 PetscFunctionBegin; 1823 PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1824 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1825 if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1826 CsrMatrix *matrix; 1827 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1828 1829 PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 1830 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0)); 1831 matrix->values->assign(a->a, a->a+a->nz); 1832 PetscCallCUDA(WaitForCUDA()); 1833 PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar))); 1834 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0)); 1835 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 1836 } else { 1837 PetscInt nnz; 1838 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0)); 1839 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format)); 1840 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 1841 delete cusparsestruct->workVector; 1842 delete cusparsestruct->rowoffsets_gpu; 1843 cusparsestruct->workVector = NULL; 1844 cusparsestruct->rowoffsets_gpu = NULL; 1845 try { 1846 if (a->compressedrow.use) { 1847 m = a->compressedrow.nrows; 1848 ii = a->compressedrow.i; 1849 ridx = a->compressedrow.rindex; 1850 } else { 1851 m = A->rmap->n; 1852 ii = a->i; 1853 ridx = NULL; 1854 } 1855 PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1856 if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1857 else nnz = a->nz; 1858 PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 1859 1860 /* create cusparse matrix */ 1861 cusparsestruct->nrows = m; 1862 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1863 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 1864 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 1865 PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1866 1867 PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar))); 1868 PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar))); 1869 PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar))); 1870 PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1871 PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1872 PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1873 PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 1874 1875 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1876 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1877 /* set the matrix */ 1878 CsrMatrix *mat= new CsrMatrix; 1879 mat->num_rows = m; 1880 mat->num_cols = A->cmap->n; 1881 mat->num_entries = nnz; 1882 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1883 mat->row_offsets->assign(ii, ii + m+1); 1884 1885 mat->column_indices = new THRUSTINTARRAY32(nnz); 1886 mat->column_indices->assign(a->j, a->j+nnz); 1887 1888 mat->values = new THRUSTARRAY(nnz); 1889 if (a->a) mat->values->assign(a->a, a->a+nnz); 1890 1891 /* assign the pointer */ 1892 matstruct->mat = mat; 1893 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1894 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1895 stat = cusparseCreateCsr(&matstruct->matDescr, 1896 mat->num_rows, mat->num_cols, mat->num_entries, 1897 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1898 mat->values->data().get(), 1899 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1900 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 1901 } 1902 #endif 1903 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1904 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1905 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1906 #else 1907 CsrMatrix *mat= new CsrMatrix; 1908 mat->num_rows = m; 1909 mat->num_cols = A->cmap->n; 1910 mat->num_entries = nnz; 1911 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1912 mat->row_offsets->assign(ii, ii + m+1); 1913 1914 mat->column_indices = new THRUSTINTARRAY32(nnz); 1915 mat->column_indices->assign(a->j, a->j+nnz); 1916 1917 mat->values = new THRUSTARRAY(nnz); 1918 if (a->a) mat->values->assign(a->a, a->a+nnz); 1919 1920 cusparseHybMat_t hybMat; 1921 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1922 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1923 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1924 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1925 matstruct->descr, mat->values->data().get(), 1926 mat->row_offsets->data().get(), 1927 mat->column_indices->data().get(), 1928 hybMat, 0, partition);PetscCallCUSPARSE(stat); 1929 /* assign the pointer */ 1930 matstruct->mat = hybMat; 1931 1932 if (mat) { 1933 if (mat->values) delete (THRUSTARRAY*)mat->values; 1934 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1935 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1936 delete (CsrMatrix*)mat; 1937 } 1938 #endif 1939 } 1940 1941 /* assign the compressed row indices */ 1942 if (a->compressedrow.use) { 1943 cusparsestruct->workVector = new THRUSTARRAY(m); 1944 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1945 matstruct->cprowIndices->assign(ridx,ridx+m); 1946 tmp = m; 1947 } else { 1948 cusparsestruct->workVector = NULL; 1949 matstruct->cprowIndices = NULL; 1950 tmp = 0; 1951 } 1952 PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar))); 1953 1954 /* assign the pointer */ 1955 cusparsestruct->mat = matstruct; 1956 } catch(char *ex) { 1957 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1958 } 1959 PetscCallCUDA(WaitForCUDA()); 1960 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0)); 1961 cusparsestruct->nonzerostate = A->nonzerostate; 1962 } 1963 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 1964 } 1965 PetscFunctionReturn(0); 1966 } 1967 1968 struct VecCUDAPlusEquals 1969 { 1970 template <typename Tuple> 1971 __host__ __device__ 1972 void operator()(Tuple t) 1973 { 1974 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1975 } 1976 }; 1977 1978 struct VecCUDAEquals 1979 { 1980 template <typename Tuple> 1981 __host__ __device__ 1982 void operator()(Tuple t) 1983 { 1984 thrust::get<1>(t) = thrust::get<0>(t); 1985 } 1986 }; 1987 1988 struct VecCUDAEqualsReverse 1989 { 1990 template <typename Tuple> 1991 __host__ __device__ 1992 void operator()(Tuple t) 1993 { 1994 thrust::get<0>(t) = thrust::get<1>(t); 1995 } 1996 }; 1997 1998 struct MatMatCusparse { 1999 PetscBool cisdense; 2000 PetscScalar *Bt; 2001 Mat X; 2002 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2003 PetscLogDouble flops; 2004 CsrMatrix *Bcsr; 2005 2006 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2007 cusparseSpMatDescr_t matSpBDescr; 2008 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2009 cusparseDnMatDescr_t matBDescr; 2010 cusparseDnMatDescr_t matCDescr; 2011 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 2012 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2013 void *dBuffer4; 2014 void *dBuffer5; 2015 #endif 2016 size_t mmBufferSize; 2017 void *mmBuffer; 2018 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2019 cusparseSpGEMMDescr_t spgemmDesc; 2020 #endif 2021 }; 2022 2023 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2024 { 2025 MatMatCusparse *mmdata = (MatMatCusparse *)data; 2026 2027 PetscFunctionBegin; 2028 PetscCallCUDA(cudaFree(mmdata->Bt)); 2029 delete mmdata->Bcsr; 2030 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2031 if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 2032 if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2033 if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2034 if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2035 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2036 if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 2037 if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2038 #endif 2039 if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2040 if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2041 #endif 2042 PetscCall(MatDestroy(&mmdata->X)); 2043 PetscCall(PetscFree(data)); 2044 PetscFunctionReturn(0); 2045 } 2046 2047 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 2048 2049 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2050 { 2051 Mat_Product *product = C->product; 2052 Mat A,B; 2053 PetscInt m,n,blda,clda; 2054 PetscBool flg,biscuda; 2055 Mat_SeqAIJCUSPARSE *cusp; 2056 cusparseStatus_t stat; 2057 cusparseOperation_t opA; 2058 const PetscScalar *barray; 2059 PetscScalar *carray; 2060 MatMatCusparse *mmdata; 2061 Mat_SeqAIJCUSPARSEMultStruct *mat; 2062 CsrMatrix *csrmat; 2063 2064 PetscFunctionBegin; 2065 MatCheckProduct(C,1); 2066 PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2067 mmdata = (MatMatCusparse*)product->data; 2068 A = product->A; 2069 B = product->B; 2070 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 2071 PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2072 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2073 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2074 PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2075 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2076 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2077 switch (product->type) { 2078 case MATPRODUCT_AB: 2079 case MATPRODUCT_PtAP: 2080 mat = cusp->mat; 2081 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2082 m = A->rmap->n; 2083 n = B->cmap->n; 2084 break; 2085 case MATPRODUCT_AtB: 2086 if (!A->form_explicit_transpose) { 2087 mat = cusp->mat; 2088 opA = CUSPARSE_OPERATION_TRANSPOSE; 2089 } else { 2090 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2091 mat = cusp->matTranspose; 2092 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2093 } 2094 m = A->cmap->n; 2095 n = B->cmap->n; 2096 break; 2097 case MATPRODUCT_ABt: 2098 case MATPRODUCT_RARt: 2099 mat = cusp->mat; 2100 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2101 m = A->rmap->n; 2102 n = B->rmap->n; 2103 break; 2104 default: 2105 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2106 } 2107 PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2108 csrmat = (CsrMatrix*)mat->mat; 2109 /* if the user passed a CPU matrix, copy the data to the GPU */ 2110 PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda)); 2111 if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B)); 2112 PetscCall(MatDenseCUDAGetArrayRead(B,&barray)); 2113 2114 PetscCall(MatDenseGetLDA(B,&blda)); 2115 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2116 PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray)); 2117 PetscCall(MatDenseGetLDA(mmdata->X,&clda)); 2118 } else { 2119 PetscCall(MatDenseCUDAGetArrayWrite(C,&carray)); 2120 PetscCall(MatDenseGetLDA(C,&clda)); 2121 } 2122 2123 PetscCall(PetscLogGpuTimeBegin()); 2124 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2125 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2126 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2127 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2128 size_t mmBufferSize; 2129 if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;} 2130 if (!mmdata->matBDescr) { 2131 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2132 mmdata->Blda = blda; 2133 } 2134 2135 if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;} 2136 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2137 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2138 mmdata->Clda = clda; 2139 } 2140 2141 if (!mat->matDescr) { 2142 stat = cusparseCreateCsr(&mat->matDescr, 2143 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2144 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2145 csrmat->values->data().get(), 2146 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2147 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 2148 } 2149 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2150 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2151 mmdata->matCDescr,cusparse_scalartype, 2152 cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat); 2153 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2154 PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2155 PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize)); 2156 mmdata->mmBufferSize = mmBufferSize; 2157 } 2158 mmdata->initialized = PETSC_TRUE; 2159 } else { 2160 /* to be safe, always update pointers of the mats */ 2161 PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get())); 2162 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray)); 2163 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray)); 2164 } 2165 2166 /* do cusparseSpMM, which supports transpose on B */ 2167 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2168 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2169 mmdata->matCDescr,cusparse_scalartype, 2170 cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2171 #else 2172 PetscInt k; 2173 /* cusparseXcsrmm does not support transpose on B */ 2174 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2175 cublasHandle_t cublasv2handle; 2176 cublasStatus_t cerr; 2177 2178 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2179 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2180 B->cmap->n,B->rmap->n, 2181 &PETSC_CUSPARSE_ONE ,barray,blda, 2182 &PETSC_CUSPARSE_ZERO,barray,blda, 2183 mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr); 2184 blda = B->cmap->n; 2185 k = B->cmap->n; 2186 } else { 2187 k = B->rmap->n; 2188 } 2189 2190 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2191 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2192 csrmat->num_entries,mat->alpha_one,mat->descr, 2193 csrmat->values->data().get(), 2194 csrmat->row_offsets->data().get(), 2195 csrmat->column_indices->data().get(), 2196 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 2197 carray,clda);PetscCallCUSPARSE(stat); 2198 #endif 2199 PetscCall(PetscLogGpuTimeEnd()); 2200 PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries)); 2201 PetscCall(MatDenseCUDARestoreArrayRead(B,&barray)); 2202 if (product->type == MATPRODUCT_RARt) { 2203 PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray)); 2204 PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE)); 2205 } else if (product->type == MATPRODUCT_PtAP) { 2206 PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray)); 2207 PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE)); 2208 } else { 2209 PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray)); 2210 } 2211 if (mmdata->cisdense) { 2212 PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C)); 2213 } 2214 if (!biscuda) { 2215 PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B)); 2216 } 2217 PetscFunctionReturn(0); 2218 } 2219 2220 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2221 { 2222 Mat_Product *product = C->product; 2223 Mat A,B; 2224 PetscInt m,n; 2225 PetscBool cisdense,flg; 2226 MatMatCusparse *mmdata; 2227 Mat_SeqAIJCUSPARSE *cusp; 2228 2229 PetscFunctionBegin; 2230 MatCheckProduct(C,1); 2231 PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2232 A = product->A; 2233 B = product->B; 2234 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 2235 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2236 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2237 PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2238 switch (product->type) { 2239 case MATPRODUCT_AB: 2240 m = A->rmap->n; 2241 n = B->cmap->n; 2242 break; 2243 case MATPRODUCT_AtB: 2244 m = A->cmap->n; 2245 n = B->cmap->n; 2246 break; 2247 case MATPRODUCT_ABt: 2248 m = A->rmap->n; 2249 n = B->rmap->n; 2250 break; 2251 case MATPRODUCT_PtAP: 2252 m = B->cmap->n; 2253 n = B->cmap->n; 2254 break; 2255 case MATPRODUCT_RARt: 2256 m = B->rmap->n; 2257 n = B->rmap->n; 2258 break; 2259 default: 2260 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2261 } 2262 PetscCall(MatSetSizes(C,m,n,m,n)); 2263 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2264 PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense)); 2265 PetscCall(MatSetType(C,MATSEQDENSECUDA)); 2266 2267 /* product data */ 2268 PetscCall(PetscNew(&mmdata)); 2269 mmdata->cisdense = cisdense; 2270 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2271 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2272 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2273 PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar))); 2274 } 2275 #endif 2276 /* for these products we need intermediate storage */ 2277 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2278 PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X)); 2279 PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA)); 2280 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2281 PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n)); 2282 } else { 2283 PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n)); 2284 } 2285 } 2286 C->product->data = mmdata; 2287 C->product->destroy = MatDestroy_MatMatCusparse; 2288 2289 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2290 PetscFunctionReturn(0); 2291 } 2292 2293 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2294 { 2295 Mat_Product *product = C->product; 2296 Mat A,B; 2297 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2298 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2299 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2300 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2301 PetscBool flg; 2302 cusparseStatus_t stat; 2303 MatProductType ptype; 2304 MatMatCusparse *mmdata; 2305 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2306 cusparseSpMatDescr_t BmatSpDescr; 2307 #endif 2308 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2309 2310 PetscFunctionBegin; 2311 MatCheckProduct(C,1); 2312 PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2313 PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg)); 2314 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 2315 mmdata = (MatMatCusparse*)C->product->data; 2316 A = product->A; 2317 B = product->B; 2318 if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2319 mmdata->reusesym = PETSC_FALSE; 2320 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2321 PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2322 Cmat = Ccusp->mat; 2323 PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2324 Ccsr = (CsrMatrix*)Cmat->mat; 2325 PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2326 goto finalize; 2327 } 2328 if (!c->nz) goto finalize; 2329 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 2330 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2331 PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg)); 2332 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2333 PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2334 PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2335 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2336 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2337 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2338 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2339 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2340 PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2341 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2342 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2343 2344 ptype = product->type; 2345 if (A->symmetric && ptype == MATPRODUCT_AtB) { 2346 ptype = MATPRODUCT_AB; 2347 PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric"); 2348 } 2349 if (B->symmetric && ptype == MATPRODUCT_ABt) { 2350 ptype = MATPRODUCT_AB; 2351 PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric"); 2352 } 2353 switch (ptype) { 2354 case MATPRODUCT_AB: 2355 Amat = Acusp->mat; 2356 Bmat = Bcusp->mat; 2357 break; 2358 case MATPRODUCT_AtB: 2359 Amat = Acusp->matTranspose; 2360 Bmat = Bcusp->mat; 2361 break; 2362 case MATPRODUCT_ABt: 2363 Amat = Acusp->mat; 2364 Bmat = Bcusp->matTranspose; 2365 break; 2366 default: 2367 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2368 } 2369 Cmat = Ccusp->mat; 2370 PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2371 PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2372 PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2373 Acsr = (CsrMatrix*)Amat->mat; 2374 Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2375 Ccsr = (CsrMatrix*)Cmat->mat; 2376 PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2377 PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2378 PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2379 PetscCall(PetscLogGpuTimeBegin()); 2380 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2381 BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2382 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2383 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2384 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2385 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2386 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2387 mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2388 #else 2389 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2390 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2391 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2392 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2393 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2394 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2395 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2396 #endif 2397 #else 2398 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2399 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2400 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2401 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2402 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat); 2403 #endif 2404 PetscCall(PetscLogGpuFlops(mmdata->flops)); 2405 PetscCallCUDA(WaitForCUDA()); 2406 PetscCall(PetscLogGpuTimeEnd()); 2407 C->offloadmask = PETSC_OFFLOAD_GPU; 2408 finalize: 2409 /* shorter version of MatAssemblyEnd_SeqAIJ */ 2410 PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 2411 PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n")); 2412 PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 2413 c->reallocs = 0; 2414 C->info.mallocs += 0; 2415 C->info.nz_unneeded = 0; 2416 C->assembled = C->was_assembled = PETSC_TRUE; 2417 C->num_ass++; 2418 PetscFunctionReturn(0); 2419 } 2420 2421 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2422 { 2423 Mat_Product *product = C->product; 2424 Mat A,B; 2425 Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2426 Mat_SeqAIJ *a,*b,*c; 2427 Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2428 CsrMatrix *Acsr,*Bcsr,*Ccsr; 2429 PetscInt i,j,m,n,k; 2430 PetscBool flg; 2431 cusparseStatus_t stat; 2432 MatProductType ptype; 2433 MatMatCusparse *mmdata; 2434 PetscLogDouble flops; 2435 PetscBool biscompressed,ciscompressed; 2436 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2437 int64_t C_num_rows1, C_num_cols1, C_nnz1; 2438 cusparseSpMatDescr_t BmatSpDescr; 2439 #else 2440 int cnz; 2441 #endif 2442 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2443 2444 PetscFunctionBegin; 2445 MatCheckProduct(C,1); 2446 PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2447 A = product->A; 2448 B = product->B; 2449 PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 2450 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2451 PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg)); 2452 PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2453 a = (Mat_SeqAIJ*)A->data; 2454 b = (Mat_SeqAIJ*)B->data; 2455 /* product data */ 2456 PetscCall(PetscNew(&mmdata)); 2457 C->product->data = mmdata; 2458 C->product->destroy = MatDestroy_MatMatCusparse; 2459 2460 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2461 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2462 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2463 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2464 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2465 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2466 2467 ptype = product->type; 2468 if (A->symmetric && ptype == MATPRODUCT_AtB) { 2469 ptype = MATPRODUCT_AB; 2470 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2471 } 2472 if (B->symmetric && ptype == MATPRODUCT_ABt) { 2473 ptype = MATPRODUCT_AB; 2474 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2475 } 2476 biscompressed = PETSC_FALSE; 2477 ciscompressed = PETSC_FALSE; 2478 switch (ptype) { 2479 case MATPRODUCT_AB: 2480 m = A->rmap->n; 2481 n = B->cmap->n; 2482 k = A->cmap->n; 2483 Amat = Acusp->mat; 2484 Bmat = Bcusp->mat; 2485 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2486 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2487 break; 2488 case MATPRODUCT_AtB: 2489 m = A->cmap->n; 2490 n = B->cmap->n; 2491 k = A->rmap->n; 2492 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2493 Amat = Acusp->matTranspose; 2494 Bmat = Bcusp->mat; 2495 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2496 break; 2497 case MATPRODUCT_ABt: 2498 m = A->rmap->n; 2499 n = B->rmap->n; 2500 k = A->cmap->n; 2501 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 2502 Amat = Acusp->mat; 2503 Bmat = Bcusp->matTranspose; 2504 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2505 break; 2506 default: 2507 SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2508 } 2509 2510 /* create cusparse matrix */ 2511 PetscCall(MatSetSizes(C,m,n,m,n)); 2512 PetscCall(MatSetType(C,MATSEQAIJCUSPARSE)); 2513 c = (Mat_SeqAIJ*)C->data; 2514 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2515 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2516 Ccsr = new CsrMatrix; 2517 2518 c->compressedrow.use = ciscompressed; 2519 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2520 c->compressedrow.nrows = a->compressedrow.nrows; 2521 PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex)); 2522 PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows)); 2523 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2524 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2525 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2526 } else { 2527 c->compressedrow.nrows = 0; 2528 c->compressedrow.i = NULL; 2529 c->compressedrow.rindex = NULL; 2530 Ccusp->workVector = NULL; 2531 Cmat->cprowIndices = NULL; 2532 } 2533 Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2534 Ccusp->mat = Cmat; 2535 Ccusp->mat->mat = Ccsr; 2536 Ccsr->num_rows = Ccusp->nrows; 2537 Ccsr->num_cols = n; 2538 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2539 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 2540 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 2541 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 2542 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 2543 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 2544 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 2545 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2546 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2547 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2548 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 2549 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2550 c->nz = 0; 2551 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2552 Ccsr->values = new THRUSTARRAY(c->nz); 2553 goto finalizesym; 2554 } 2555 2556 PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2557 PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2558 Acsr = (CsrMatrix*)Amat->mat; 2559 if (!biscompressed) { 2560 Bcsr = (CsrMatrix*)Bmat->mat; 2561 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2562 BmatSpDescr = Bmat->matDescr; 2563 #endif 2564 } else { /* we need to use row offsets for the full matrix */ 2565 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2566 Bcsr = new CsrMatrix; 2567 Bcsr->num_rows = B->rmap->n; 2568 Bcsr->num_cols = cBcsr->num_cols; 2569 Bcsr->num_entries = cBcsr->num_entries; 2570 Bcsr->column_indices = cBcsr->column_indices; 2571 Bcsr->values = cBcsr->values; 2572 if (!Bcusp->rowoffsets_gpu) { 2573 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2574 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2575 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 2576 } 2577 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2578 mmdata->Bcsr = Bcsr; 2579 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2580 if (Bcsr->num_rows && Bcsr->num_cols) { 2581 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2582 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2583 Bcsr->values->data().get(), 2584 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2585 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 2586 } 2587 BmatSpDescr = mmdata->matSpBDescr; 2588 #endif 2589 } 2590 PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2591 PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2592 /* precompute flops count */ 2593 if (ptype == MATPRODUCT_AB) { 2594 for (i=0, flops = 0; i<A->rmap->n; i++) { 2595 const PetscInt st = a->i[i]; 2596 const PetscInt en = a->i[i+1]; 2597 for (j=st; j<en; j++) { 2598 const PetscInt brow = a->j[j]; 2599 flops += 2.*(b->i[brow+1] - b->i[brow]); 2600 } 2601 } 2602 } else if (ptype == MATPRODUCT_AtB) { 2603 for (i=0, flops = 0; i<A->rmap->n; i++) { 2604 const PetscInt anzi = a->i[i+1] - a->i[i]; 2605 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2606 flops += (2.*anzi)*bnzi; 2607 } 2608 } else { /* TODO */ 2609 flops = 0.; 2610 } 2611 2612 mmdata->flops = flops; 2613 PetscCall(PetscLogGpuTimeBegin()); 2614 2615 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2616 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2617 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2618 NULL, NULL, NULL, 2619 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2620 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 2621 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 2622 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2623 { 2624 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 2625 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2626 */ 2627 void* dBuffer1 = NULL; 2628 void* dBuffer2 = NULL; 2629 void* dBuffer3 = NULL; 2630 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2631 size_t bufferSize1 = 0; 2632 size_t bufferSize2 = 0; 2633 size_t bufferSize3 = 0; 2634 size_t bufferSize4 = 0; 2635 size_t bufferSize5 = 0; 2636 2637 /*----------------------------------------------------------------------*/ 2638 /* ask bufferSize1 bytes for external memory */ 2639 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2640 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2641 &bufferSize1, NULL);PetscCallCUSPARSE(stat); 2642 PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1)); 2643 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2644 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2645 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2646 &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat); 2647 2648 /*----------------------------------------------------------------------*/ 2649 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2650 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2651 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat); 2652 PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2)); 2653 PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3)); 2654 PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4)); 2655 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2656 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2657 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat); 2658 PetscCallCUDA(cudaFree(dBuffer1)); 2659 PetscCallCUDA(cudaFree(dBuffer2)); 2660 2661 /*----------------------------------------------------------------------*/ 2662 /* get matrix C non-zero entries C_nnz1 */ 2663 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2664 c->nz = (PetscInt) C_nnz1; 2665 /* allocate matrix C */ 2666 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2667 Ccsr->values = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2668 /* update matC with the new pointers */ 2669 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2670 Ccsr->values->data().get());PetscCallCUSPARSE(stat); 2671 2672 /*----------------------------------------------------------------------*/ 2673 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2674 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2675 &bufferSize5, NULL);PetscCallCUSPARSE(stat); 2676 PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5)); 2677 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2678 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2679 &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat); 2680 PetscCallCUDA(cudaFree(dBuffer3)); 2681 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2682 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2683 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2684 mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2685 PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024)); 2686 } 2687 #else 2688 size_t bufSize2; 2689 /* ask bufferSize bytes for external memory */ 2690 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2691 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2692 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2693 mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat); 2694 PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2)); 2695 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2696 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2697 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2698 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2699 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat); 2700 /* ask bufferSize again bytes for external memory */ 2701 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2702 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2703 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2704 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat); 2705 /* The CUSPARSE documentation is not clear, nor the API 2706 We need both buffers to perform the operations properly! 2707 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2708 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2709 is stored in the descriptor! What a messy API... */ 2710 PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize)); 2711 /* compute the intermediate product of A * B */ 2712 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2713 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2714 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2715 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2716 /* get matrix C non-zero entries C_nnz1 */ 2717 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2718 c->nz = (PetscInt) C_nnz1; 2719 PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024)); 2720 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2721 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2722 Ccsr->values = new THRUSTARRAY(c->nz); 2723 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2724 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2725 Ccsr->values->data().get());PetscCallCUSPARSE(stat); 2726 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2727 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2728 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2729 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2730 #else 2731 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 2732 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2733 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2734 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2735 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2736 Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat); 2737 c->nz = cnz; 2738 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2739 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2740 Ccsr->values = new THRUSTARRAY(c->nz); 2741 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2742 2743 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2744 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2745 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2746 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2747 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2748 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2749 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2750 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2751 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat); 2752 #endif 2753 PetscCall(PetscLogGpuFlops(mmdata->flops)); 2754 PetscCall(PetscLogGpuTimeEnd()); 2755 finalizesym: 2756 c->singlemalloc = PETSC_FALSE; 2757 c->free_a = PETSC_TRUE; 2758 c->free_ij = PETSC_TRUE; 2759 PetscCall(PetscMalloc1(m+1,&c->i)); 2760 PetscCall(PetscMalloc1(c->nz,&c->j)); 2761 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2762 PetscInt *d_i = c->i; 2763 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2764 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2765 ii = *Ccsr->row_offsets; 2766 jj = *Ccsr->column_indices; 2767 if (ciscompressed) d_i = c->compressedrow.i; 2768 PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2769 PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2770 } else { 2771 PetscInt *d_i = c->i; 2772 if (ciscompressed) d_i = c->compressedrow.i; 2773 PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2774 PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2775 } 2776 if (ciscompressed) { /* need to expand host row offsets */ 2777 PetscInt r = 0; 2778 c->i[0] = 0; 2779 for (k = 0; k < c->compressedrow.nrows; k++) { 2780 const PetscInt next = c->compressedrow.rindex[k]; 2781 const PetscInt old = c->compressedrow.i[k]; 2782 for (; r < next; r++) c->i[r+1] = old; 2783 } 2784 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2785 } 2786 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 2787 PetscCall(PetscMalloc1(m,&c->ilen)); 2788 PetscCall(PetscMalloc1(m,&c->imax)); 2789 c->maxnz = c->nz; 2790 c->nonzerorowcnt = 0; 2791 c->rmax = 0; 2792 for (k = 0; k < m; k++) { 2793 const PetscInt nn = c->i[k+1] - c->i[k]; 2794 c->ilen[k] = c->imax[k] = nn; 2795 c->nonzerorowcnt += (PetscInt)!!nn; 2796 c->rmax = PetscMax(c->rmax,nn); 2797 } 2798 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 2799 PetscCall(PetscMalloc1(c->nz,&c->a)); 2800 Ccsr->num_entries = c->nz; 2801 2802 C->nonzerostate++; 2803 PetscCall(PetscLayoutSetUp(C->rmap)); 2804 PetscCall(PetscLayoutSetUp(C->cmap)); 2805 Ccusp->nonzerostate = C->nonzerostate; 2806 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2807 C->preallocated = PETSC_TRUE; 2808 C->assembled = PETSC_FALSE; 2809 C->was_assembled = PETSC_FALSE; 2810 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2811 mmdata->reusesym = PETSC_TRUE; 2812 C->offloadmask = PETSC_OFFLOAD_GPU; 2813 } 2814 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2815 PetscFunctionReturn(0); 2816 } 2817 2818 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2819 2820 /* handles sparse or dense B */ 2821 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2822 { 2823 Mat_Product *product = mat->product; 2824 PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2825 2826 PetscFunctionBegin; 2827 MatCheckProduct(mat,1); 2828 PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense)); 2829 if (!product->A->boundtocpu && !product->B->boundtocpu) { 2830 PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp)); 2831 } 2832 if (product->type == MATPRODUCT_ABC) { 2833 Ciscusp = PETSC_FALSE; 2834 if (!product->C->boundtocpu) { 2835 PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp)); 2836 } 2837 } 2838 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 2839 PetscBool usecpu = PETSC_FALSE; 2840 switch (product->type) { 2841 case MATPRODUCT_AB: 2842 if (product->api_user) { 2843 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat"); 2844 PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL)); 2845 PetscOptionsEnd(); 2846 } else { 2847 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat"); 2848 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL)); 2849 PetscOptionsEnd(); 2850 } 2851 break; 2852 case MATPRODUCT_AtB: 2853 if (product->api_user) { 2854 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat"); 2855 PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 2856 PetscOptionsEnd(); 2857 } else { 2858 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat"); 2859 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 2860 PetscOptionsEnd(); 2861 } 2862 break; 2863 case MATPRODUCT_PtAP: 2864 if (product->api_user) { 2865 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat"); 2866 PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 2867 PetscOptionsEnd(); 2868 } else { 2869 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat"); 2870 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 2871 PetscOptionsEnd(); 2872 } 2873 break; 2874 case MATPRODUCT_RARt: 2875 if (product->api_user) { 2876 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat"); 2877 PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL)); 2878 PetscOptionsEnd(); 2879 } else { 2880 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat"); 2881 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL)); 2882 PetscOptionsEnd(); 2883 } 2884 break; 2885 case MATPRODUCT_ABC: 2886 if (product->api_user) { 2887 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat"); 2888 PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL)); 2889 PetscOptionsEnd(); 2890 } else { 2891 PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat"); 2892 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL)); 2893 PetscOptionsEnd(); 2894 } 2895 break; 2896 default: 2897 break; 2898 } 2899 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 2900 } 2901 /* dispatch */ 2902 if (isdense) { 2903 switch (product->type) { 2904 case MATPRODUCT_AB: 2905 case MATPRODUCT_AtB: 2906 case MATPRODUCT_ABt: 2907 case MATPRODUCT_PtAP: 2908 case MATPRODUCT_RARt: 2909 if (product->A->boundtocpu) { 2910 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 2911 } else { 2912 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2913 } 2914 break; 2915 case MATPRODUCT_ABC: 2916 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2917 break; 2918 default: 2919 break; 2920 } 2921 } else if (Biscusp && Ciscusp) { 2922 switch (product->type) { 2923 case MATPRODUCT_AB: 2924 case MATPRODUCT_AtB: 2925 case MATPRODUCT_ABt: 2926 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2927 break; 2928 case MATPRODUCT_PtAP: 2929 case MATPRODUCT_RARt: 2930 case MATPRODUCT_ABC: 2931 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2932 break; 2933 default: 2934 break; 2935 } 2936 } else { /* fallback for AIJ */ 2937 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 2938 } 2939 PetscFunctionReturn(0); 2940 } 2941 2942 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2943 { 2944 PetscFunctionBegin; 2945 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE)); 2946 PetscFunctionReturn(0); 2947 } 2948 2949 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2950 { 2951 PetscFunctionBegin; 2952 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE)); 2953 PetscFunctionReturn(0); 2954 } 2955 2956 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2957 { 2958 PetscFunctionBegin; 2959 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE)); 2960 PetscFunctionReturn(0); 2961 } 2962 2963 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2964 { 2965 PetscFunctionBegin; 2966 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE)); 2967 PetscFunctionReturn(0); 2968 } 2969 2970 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2971 { 2972 PetscFunctionBegin; 2973 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE)); 2974 PetscFunctionReturn(0); 2975 } 2976 2977 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2978 { 2979 int i = blockIdx.x*blockDim.x + threadIdx.x; 2980 if (i < n) y[idx[i]] += x[i]; 2981 } 2982 2983 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2984 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2985 { 2986 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2987 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2988 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2989 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2990 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2991 PetscBool compressed; 2992 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2993 PetscInt nx,ny; 2994 #endif 2995 2996 PetscFunctionBegin; 2997 PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 2998 if (!a->nz) { 2999 if (!yy) PetscCall(VecSet_SeqCUDA(zz,0)); 3000 else PetscCall(VecCopy_SeqCUDA(yy,zz)); 3001 PetscFunctionReturn(0); 3002 } 3003 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3004 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3005 if (!trans) { 3006 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3007 PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3008 } else { 3009 if (herm || !A->form_explicit_transpose) { 3010 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3011 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3012 } else { 3013 if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3014 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 3015 } 3016 } 3017 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3018 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3019 3020 try { 3021 PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray)); 3022 if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */ 3023 else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */ 3024 3025 PetscCall(PetscLogGpuTimeBegin()); 3026 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3027 /* z = A x + beta y. 3028 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3029 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3030 */ 3031 xptr = xarray; 3032 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3033 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3034 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3035 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3036 allocated to accommodate different uses. So we get the length info directly from mat. 3037 */ 3038 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3039 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3040 nx = mat->num_cols; 3041 ny = mat->num_rows; 3042 } 3043 #endif 3044 } else { 3045 /* z = A^T x + beta y 3046 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3047 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3048 */ 3049 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3050 dptr = zarray; 3051 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 3052 if (compressed) { /* Scatter x to work vector */ 3053 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3054 thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3055 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3056 VecCUDAEqualsReverse()); 3057 } 3058 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3059 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3060 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3061 nx = mat->num_rows; 3062 ny = mat->num_cols; 3063 } 3064 #endif 3065 } 3066 3067 /* csr_spmv does y = alpha op(A) x + beta y */ 3068 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3069 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3070 PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3071 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3072 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype)); 3073 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype)); 3074 PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 3075 matstruct->matDescr, 3076 matstruct->cuSpMV[opA].vecXDescr, beta, 3077 matstruct->cuSpMV[opA].vecYDescr, 3078 cusparse_scalartype, 3079 cusparsestruct->spmvAlg, 3080 &matstruct->cuSpMV[opA].spmvBufferSize)); 3081 PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize)); 3082 3083 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3084 } else { 3085 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3086 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr)); 3087 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr)); 3088 } 3089 3090 PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, 3091 matstruct->alpha_one, 3092 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3093 matstruct->cuSpMV[opA].vecXDescr, 3094 beta, 3095 matstruct->cuSpMV[opA].vecYDescr, 3096 cusparse_scalartype, 3097 cusparsestruct->spmvAlg, 3098 matstruct->cuSpMV[opA].spmvBuffer)); 3099 #else 3100 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3101 PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, 3102 mat->num_rows, mat->num_cols, 3103 mat->num_entries, matstruct->alpha_one, matstruct->descr, 3104 mat->values->data().get(), mat->row_offsets->data().get(), 3105 mat->column_indices->data().get(), xptr, beta, 3106 dptr)); 3107 #endif 3108 } else { 3109 if (cusparsestruct->nrows) { 3110 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3111 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3112 #else 3113 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3114 PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, 3115 matstruct->alpha_one, matstruct->descr, hybMat, 3116 xptr, beta, 3117 dptr)); 3118 #endif 3119 } 3120 } 3121 PetscCall(PetscLogGpuTimeEnd()); 3122 3123 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3124 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3125 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3126 PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */ 3127 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3128 PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */ 3129 } 3130 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3131 PetscCall(VecSet_SeqCUDA(zz,0)); 3132 } 3133 3134 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3135 if (compressed) { 3136 PetscCall(PetscLogGpuTimeBegin()); 3137 /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3138 and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3139 prevent that. So I just add a ScatterAdd kernel. 3140 */ 3141 #if 0 3142 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3143 thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3144 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3145 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3146 VecCUDAPlusEquals()); 3147 #else 3148 PetscInt n = matstruct->cprowIndices->size(); 3149 ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 3150 #endif 3151 PetscCall(PetscLogGpuTimeEnd()); 3152 } 3153 } else { 3154 if (yy && yy != zz) { 3155 PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */ 3156 } 3157 } 3158 PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray)); 3159 if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray)); 3160 else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray)); 3161 } catch(char *ex) { 3162 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 3163 } 3164 if (yy) { 3165 PetscCall(PetscLogGpuFlops(2.0*a->nz)); 3166 } else { 3167 PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt)); 3168 } 3169 PetscFunctionReturn(0); 3170 } 3171 3172 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3173 { 3174 PetscFunctionBegin; 3175 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE)); 3176 PetscFunctionReturn(0); 3177 } 3178 3179 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 3180 { 3181 PetscObjectState onnz = A->nonzerostate; 3182 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3183 3184 PetscFunctionBegin; 3185 PetscCall(MatAssemblyEnd_SeqAIJ(A,mode)); 3186 if (onnz != A->nonzerostate && cusp->deviceMat) { 3187 3188 PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n")); 3189 PetscCallCUDA(cudaFree(cusp->deviceMat)); 3190 cusp->deviceMat = NULL; 3191 } 3192 PetscFunctionReturn(0); 3193 } 3194 3195 /* --------------------------------------------------------------------------------*/ 3196 /*@ 3197 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 3198 (the default parallel PETSc format). This matrix will ultimately pushed down 3199 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 3200 assembly performance the user should preallocate the matrix storage by setting 3201 the parameter nz (or the array nnz). By setting these parameters accurately, 3202 performance during matrix assembly can be increased by more than a factor of 50. 3203 3204 Collective 3205 3206 Input Parameters: 3207 + comm - MPI communicator, set to PETSC_COMM_SELF 3208 . m - number of rows 3209 . n - number of columns 3210 . nz - number of nonzeros per row (same for all rows) 3211 - nnz - array containing the number of nonzeros in the various rows 3212 (possibly different for each row) or NULL 3213 3214 Output Parameter: 3215 . A - the matrix 3216 3217 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 3218 MatXXXXSetPreallocation() paradgm instead of this routine directly. 3219 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 3220 3221 Notes: 3222 If nnz is given then nz is ignored 3223 3224 The AIJ format (also called the Yale sparse matrix format or 3225 compressed row storage), is fully compatible with standard Fortran 77 3226 storage. That is, the stored row and column indices can begin at 3227 either one (as in Fortran) or zero. See the users' manual for details. 3228 3229 Specify the preallocated storage with either nz or nnz (not both). 3230 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 3231 allocation. For large problems you MUST preallocate memory or you 3232 will get TERRIBLE performance, see the users' manual chapter on matrices. 3233 3234 By default, this format uses inodes (identical nodes) when possible, to 3235 improve numerical efficiency of matrix-vector products and solves. We 3236 search for consecutive rows with the same nonzero structure, thereby 3237 reusing matrix information to achieve increased efficiency. 3238 3239 Level: intermediate 3240 3241 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE` 3242 @*/ 3243 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 3244 { 3245 PetscFunctionBegin; 3246 PetscCall(MatCreate(comm,A)); 3247 PetscCall(MatSetSizes(*A,m,n,m,n)); 3248 PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE)); 3249 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 3250 PetscFunctionReturn(0); 3251 } 3252 3253 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3254 { 3255 PetscFunctionBegin; 3256 if (A->factortype == MAT_FACTOR_NONE) { 3257 PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr)); 3258 } else { 3259 PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr)); 3260 } 3261 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL)); 3262 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL)); 3263 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL)); 3264 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL)); 3265 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL)); 3266 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL)); 3267 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 3268 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 3269 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 3270 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL)); 3271 PetscCall(MatDestroy_SeqAIJ(A)); 3272 PetscFunctionReturn(0); 3273 } 3274 3275 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3276 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3277 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3278 { 3279 PetscFunctionBegin; 3280 PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B)); 3281 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B)); 3282 PetscFunctionReturn(0); 3283 } 3284 3285 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 3286 { 3287 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3288 Mat_SeqAIJCUSPARSE *cy; 3289 Mat_SeqAIJCUSPARSE *cx; 3290 PetscScalar *ay; 3291 const PetscScalar *ax; 3292 CsrMatrix *csry,*csrx; 3293 3294 PetscFunctionBegin; 3295 cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3296 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3297 if (X->ops->axpy != Y->ops->axpy) { 3298 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE)); 3299 PetscCall(MatAXPY_SeqAIJ(Y,a,X,str)); 3300 PetscFunctionReturn(0); 3301 } 3302 /* if we are here, it means both matrices are bound to GPU */ 3303 PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 3304 PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 3305 PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3306 PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3307 csry = (CsrMatrix*)cy->mat->mat; 3308 csrx = (CsrMatrix*)cx->mat->mat; 3309 /* see if we can turn this into a cublas axpy */ 3310 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3311 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3312 if (eq) { 3313 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3314 } 3315 if (eq) str = SAME_NONZERO_PATTERN; 3316 } 3317 /* spgeam is buggy with one column */ 3318 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3319 3320 if (str == SUBSET_NONZERO_PATTERN) { 3321 PetscScalar b = 1.0; 3322 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3323 size_t bufferSize; 3324 void *buffer; 3325 #endif 3326 3327 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax)); 3328 PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 3329 PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 3330 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3331 PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3332 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3333 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3334 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize)); 3335 PetscCallCUDA(cudaMalloc(&buffer,bufferSize)); 3336 PetscCall(PetscLogGpuTimeBegin()); 3337 PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3338 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3339 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3340 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer)); 3341 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 3342 PetscCall(PetscLogGpuTimeEnd()); 3343 PetscCallCUDA(cudaFree(buffer)); 3344 #else 3345 PetscCall(PetscLogGpuTimeBegin()); 3346 PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3347 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3348 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3349 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get())); 3350 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 3351 PetscCall(PetscLogGpuTimeEnd()); 3352 #endif 3353 PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3354 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax)); 3355 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 3356 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3357 } else if (str == SAME_NONZERO_PATTERN) { 3358 cublasHandle_t cublasv2handle; 3359 PetscBLASInt one = 1, bnz = 1; 3360 3361 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax)); 3362 PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 3363 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 3364 PetscCall(PetscBLASIntCast(x->nz,&bnz)); 3365 PetscCall(PetscLogGpuTimeBegin()); 3366 PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one)); 3367 PetscCall(PetscLogGpuFlops(2.0*bnz)); 3368 PetscCall(PetscLogGpuTimeEnd()); 3369 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax)); 3370 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 3371 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3372 } else { 3373 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE)); 3374 PetscCall(MatAXPY_SeqAIJ(Y,a,X,str)); 3375 } 3376 PetscFunctionReturn(0); 3377 } 3378 3379 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 3380 { 3381 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3382 PetscScalar *ay; 3383 cublasHandle_t cublasv2handle; 3384 PetscBLASInt one = 1, bnz = 1; 3385 3386 PetscFunctionBegin; 3387 PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 3388 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 3389 PetscCall(PetscBLASIntCast(y->nz,&bnz)); 3390 PetscCall(PetscLogGpuTimeBegin()); 3391 PetscCallCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one)); 3392 PetscCall(PetscLogGpuFlops(bnz)); 3393 PetscCall(PetscLogGpuTimeEnd()); 3394 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 3395 PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3396 PetscFunctionReturn(0); 3397 } 3398 3399 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3400 { 3401 PetscBool both = PETSC_FALSE; 3402 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3403 3404 PetscFunctionBegin; 3405 if (A->factortype == MAT_FACTOR_NONE) { 3406 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3407 if (spptr->mat) { 3408 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3409 if (matrix->values) { 3410 both = PETSC_TRUE; 3411 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3412 } 3413 } 3414 if (spptr->matTranspose) { 3415 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3416 if (matrix->values) { 3417 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3418 } 3419 } 3420 } 3421 PetscCall(PetscArrayzero(a->a,a->i[A->rmap->n])); 3422 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 3423 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3424 else A->offloadmask = PETSC_OFFLOAD_CPU; 3425 PetscFunctionReturn(0); 3426 } 3427 3428 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3429 { 3430 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3431 3432 PetscFunctionBegin; 3433 if (A->factortype != MAT_FACTOR_NONE) { 3434 A->boundtocpu = flg; 3435 PetscFunctionReturn(0); 3436 } 3437 if (flg) { 3438 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 3439 3440 A->ops->scale = MatScale_SeqAIJ; 3441 A->ops->axpy = MatAXPY_SeqAIJ; 3442 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3443 A->ops->mult = MatMult_SeqAIJ; 3444 A->ops->multadd = MatMultAdd_SeqAIJ; 3445 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3446 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3447 A->ops->multhermitiantranspose = NULL; 3448 A->ops->multhermitiantransposeadd = NULL; 3449 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3450 PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps))); 3451 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL)); 3452 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL)); 3453 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL)); 3454 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 3455 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 3456 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL)); 3457 } else { 3458 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3459 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3460 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3461 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3462 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3463 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3464 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3465 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3466 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3467 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3468 a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 3469 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 3470 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 3471 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 3472 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 3473 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 3474 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 3475 3476 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE)); 3477 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 3478 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 3479 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE)); 3480 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE)); 3481 PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 3482 } 3483 A->boundtocpu = flg; 3484 if (flg && a->inode.size) { 3485 a->inode.use = PETSC_TRUE; 3486 } else { 3487 a->inode.use = PETSC_FALSE; 3488 } 3489 PetscFunctionReturn(0); 3490 } 3491 3492 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 3493 { 3494 Mat B; 3495 3496 PetscFunctionBegin; 3497 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */ 3498 if (reuse == MAT_INITIAL_MATRIX) { 3499 PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); 3500 } else if (reuse == MAT_REUSE_MATRIX) { 3501 PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); 3502 } 3503 B = *newmat; 3504 3505 PetscCall(PetscFree(B->defaultvectype)); 3506 PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype)); 3507 3508 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 3509 if (B->factortype == MAT_FACTOR_NONE) { 3510 Mat_SeqAIJCUSPARSE *spptr; 3511 PetscCall(PetscNew(&spptr)); 3512 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 3513 PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream)); 3514 spptr->format = MAT_CUSPARSE_CSR; 3515 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3516 #if CUSPARSE_VERSION > 11301 3517 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3518 #else 3519 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3520 #endif 3521 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3522 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3523 #endif 3524 B->spptr = spptr; 3525 } else { 3526 Mat_SeqAIJCUSPARSETriFactors *spptr; 3527 3528 PetscCall(PetscNew(&spptr)); 3529 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 3530 PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream)); 3531 B->spptr = spptr; 3532 } 3533 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3534 } 3535 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 3536 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 3537 B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 3538 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 3539 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3540 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 3541 3542 PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE)); 3543 PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE)); 3544 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 3545 #if defined(PETSC_HAVE_HYPRE) 3546 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE)); 3547 #endif 3548 PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 3549 PetscFunctionReturn(0); 3550 } 3551 3552 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3553 { 3554 PetscFunctionBegin; 3555 PetscCall(MatCreate_SeqAIJ(B)); 3556 PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B)); 3557 PetscFunctionReturn(0); 3558 } 3559 3560 /*MC 3561 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3562 3563 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3564 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3565 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3566 3567 Options Database Keys: 3568 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3569 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3570 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3571 + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 3572 3573 Level: beginner 3574 3575 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 3576 M*/ 3577 3578 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 3579 3580 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3581 { 3582 PetscFunctionBegin; 3583 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band)); 3584 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse)); 3585 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse)); 3586 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse)); 3587 PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse)); 3588 3589 PetscFunctionReturn(0); 3590 } 3591 3592 static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) 3593 { 3594 Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr; 3595 3596 PetscFunctionBegin; 3597 if (!cusp) PetscFunctionReturn(0); 3598 delete cusp->cooPerm; 3599 delete cusp->cooPerm_a; 3600 cusp->cooPerm = NULL; 3601 cusp->cooPerm_a = NULL; 3602 if (cusp->use_extended_coo) { 3603 PetscCallCUDA(cudaFree(cusp->jmap_d)); 3604 PetscCallCUDA(cudaFree(cusp->perm_d)); 3605 } 3606 cusp->use_extended_coo = PETSC_FALSE; 3607 PetscFunctionReturn(0); 3608 } 3609 3610 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3611 { 3612 PetscFunctionBegin; 3613 if (*cusparsestruct) { 3614 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format)); 3615 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format)); 3616 delete (*cusparsestruct)->workVector; 3617 delete (*cusparsestruct)->rowoffsets_gpu; 3618 delete (*cusparsestruct)->cooPerm; 3619 delete (*cusparsestruct)->cooPerm_a; 3620 delete (*cusparsestruct)->csr2csc_i; 3621 if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle)); 3622 if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d)); 3623 if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d)); 3624 PetscCall(PetscFree(*cusparsestruct)); 3625 } 3626 PetscFunctionReturn(0); 3627 } 3628 3629 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3630 { 3631 PetscFunctionBegin; 3632 if (*mat) { 3633 delete (*mat)->values; 3634 delete (*mat)->column_indices; 3635 delete (*mat)->row_offsets; 3636 delete *mat; 3637 *mat = 0; 3638 } 3639 PetscFunctionReturn(0); 3640 } 3641 3642 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3643 { 3644 PetscFunctionBegin; 3645 if (*trifactor) { 3646 if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 3647 if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparse_destroy_analysis_info((*trifactor)->solveInfo)); 3648 PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 3649 if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 3650 if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 3651 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3652 if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 3653 #endif 3654 PetscCall(PetscFree(*trifactor)); 3655 } 3656 PetscFunctionReturn(0); 3657 } 3658 3659 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 3660 { 3661 CsrMatrix *mat; 3662 3663 PetscFunctionBegin; 3664 if (*matstruct) { 3665 if ((*matstruct)->mat) { 3666 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3667 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3668 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3669 #else 3670 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 3671 PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 3672 #endif 3673 } else { 3674 mat = (CsrMatrix*)(*matstruct)->mat; 3675 CsrMatrix_Destroy(&mat); 3676 } 3677 } 3678 if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 3679 delete (*matstruct)->cprowIndices; 3680 if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 3681 if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 3682 if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 3683 3684 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3685 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3686 if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 3687 for (int i=0; i<3; i++) { 3688 if (mdata->cuSpMV[i].initialized) { 3689 PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 3690 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 3691 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 3692 } 3693 } 3694 #endif 3695 delete *matstruct; 3696 *matstruct = NULL; 3697 } 3698 PetscFunctionReturn(0); 3699 } 3700 3701 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 3702 { 3703 PetscFunctionBegin; 3704 if (*trifactors) { 3705 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr)); 3706 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr)); 3707 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose)); 3708 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose)); 3709 delete (*trifactors)->rpermIndices; 3710 delete (*trifactors)->cpermIndices; 3711 delete (*trifactors)->workVector; 3712 (*trifactors)->rpermIndices = NULL; 3713 (*trifactors)->cpermIndices = NULL; 3714 (*trifactors)->workVector = NULL; 3715 if ((*trifactors)->a_band_d) PetscCallCUDA(cudaFree((*trifactors)->a_band_d)); 3716 if ((*trifactors)->i_band_d) PetscCallCUDA(cudaFree((*trifactors)->i_band_d)); 3717 (*trifactors)->init_dev_prop = PETSC_FALSE; 3718 } 3719 PetscFunctionReturn(0); 3720 } 3721 3722 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3723 { 3724 cusparseHandle_t handle; 3725 3726 PetscFunctionBegin; 3727 if (*trifactors) { 3728 PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 3729 if (handle = (*trifactors)->handle) { 3730 PetscCallCUSPARSE(cusparseDestroy(handle)); 3731 } 3732 PetscCall(PetscFree(*trifactors)); 3733 } 3734 PetscFunctionReturn(0); 3735 } 3736 3737 struct IJCompare 3738 { 3739 __host__ __device__ 3740 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3741 { 3742 if (t1.get<0>() < t2.get<0>()) return true; 3743 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3744 return false; 3745 } 3746 }; 3747 3748 struct IJEqual 3749 { 3750 __host__ __device__ 3751 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 3752 { 3753 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 3754 return true; 3755 } 3756 }; 3757 3758 struct IJDiff 3759 { 3760 __host__ __device__ 3761 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3762 { 3763 return t1 == t2 ? 0 : 1; 3764 } 3765 }; 3766 3767 struct IJSum 3768 { 3769 __host__ __device__ 3770 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 3771 { 3772 return t1||t2; 3773 } 3774 }; 3775 3776 #include <thrust/iterator/discard_iterator.h> 3777 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */ 3778 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) 3779 { 3780 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3781 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3782 THRUSTARRAY *cooPerm_v = NULL; 3783 thrust::device_ptr<const PetscScalar> d_v; 3784 CsrMatrix *matrix; 3785 PetscInt n; 3786 3787 PetscFunctionBegin; 3788 PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 3789 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 3790 if (!cusp->cooPerm) { 3791 PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 3792 PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 3793 PetscFunctionReturn(0); 3794 } 3795 matrix = (CsrMatrix*)cusp->mat->mat; 3796 PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3797 if (!v) { 3798 if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3799 goto finalize; 3800 } 3801 n = cusp->cooPerm->size(); 3802 if (isCudaMem(v)) { 3803 d_v = thrust::device_pointer_cast(v); 3804 } else { 3805 cooPerm_v = new THRUSTARRAY(n); 3806 cooPerm_v->assign(v,v+n); 3807 d_v = cooPerm_v->data(); 3808 PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 3809 } 3810 PetscCall(PetscLogGpuTimeBegin()); 3811 if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3812 if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 3813 THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 3814 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3815 /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 3816 cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 3817 cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 3818 */ 3819 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3820 thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3821 delete cooPerm_w; 3822 } else { 3823 /* all nonzeros in d_v[] are unique entries */ 3824 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3825 matrix->values->begin())); 3826 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3827 matrix->values->end())); 3828 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 3829 } 3830 } else { 3831 if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 3832 auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3833 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3834 } else { 3835 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 3836 matrix->values->begin())); 3837 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 3838 matrix->values->end())); 3839 thrust::for_each(zibit,zieit,VecCUDAEquals()); 3840 } 3841 } 3842 PetscCall(PetscLogGpuTimeEnd()); 3843 finalize: 3844 delete cooPerm_v; 3845 A->offloadmask = PETSC_OFFLOAD_GPU; 3846 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 3847 /* shorter version of MatAssemblyEnd_SeqAIJ */ 3848 PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz)); 3849 PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n")); 3850 PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax)); 3851 a->reallocs = 0; 3852 A->info.mallocs += 0; 3853 A->info.nz_unneeded = 0; 3854 A->assembled = A->was_assembled = PETSC_TRUE; 3855 A->num_ass++; 3856 PetscFunctionReturn(0); 3857 } 3858 3859 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3860 { 3861 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3862 3863 PetscFunctionBegin; 3864 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3865 if (!cusp) PetscFunctionReturn(0); 3866 if (destroy) { 3867 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format)); 3868 delete cusp->csr2csc_i; 3869 cusp->csr2csc_i = NULL; 3870 } 3871 A->transupdated = PETSC_FALSE; 3872 PetscFunctionReturn(0); 3873 } 3874 3875 #include <thrust/binary_search.h> 3876 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */ 3877 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[]) 3878 { 3879 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3880 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3881 PetscInt cooPerm_n, nzr = 0; 3882 3883 PetscFunctionBegin; 3884 PetscCall(PetscLayoutSetUp(A->rmap)); 3885 PetscCall(PetscLayoutSetUp(A->cmap)); 3886 cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 3887 if (n != cooPerm_n) { 3888 delete cusp->cooPerm; 3889 delete cusp->cooPerm_a; 3890 cusp->cooPerm = NULL; 3891 cusp->cooPerm_a = NULL; 3892 } 3893 if (n) { 3894 THRUSTINTARRAY d_i(n); 3895 THRUSTINTARRAY d_j(n); 3896 THRUSTINTARRAY ii(A->rmap->n); 3897 3898 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 3899 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 3900 3901 PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 3902 d_i.assign(coo_i,coo_i+n); 3903 d_j.assign(coo_j,coo_j+n); 3904 3905 /* Ex. 3906 n = 6 3907 coo_i = [3,3,1,4,1,4] 3908 coo_j = [3,2,2,5,2,6] 3909 */ 3910 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 3911 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 3912 3913 PetscCall(PetscLogGpuTimeBegin()); 3914 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3915 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 3916 *cusp->cooPerm_a = d_i; /* copy the sorted array */ 3917 THRUSTINTARRAY w = d_j; 3918 3919 /* 3920 d_i = [1,1,3,3,4,4] 3921 d_j = [2,2,2,3,5,6] 3922 cooPerm = [2,4,1,0,3,5] 3923 */ 3924 auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 3925 3926 /* 3927 d_i = [1,3,3,4,4,x] 3928 ^ekey 3929 d_j = [2,2,3,5,6,x] 3930 ^nekye 3931 */ 3932 if (nekey == ekey) { /* all entries are unique */ 3933 delete cusp->cooPerm_a; 3934 cusp->cooPerm_a = NULL; 3935 } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 3936 /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 3937 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/ 3938 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 3939 (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 3940 w[0] = 0; 3941 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 3942 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 3943 } 3944 thrust::counting_iterator<PetscInt> search_begin(0); 3945 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 3946 search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 3947 ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */ 3948 PetscCall(PetscLogGpuTimeEnd()); 3949 3950 PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i)); 3951 a->singlemalloc = PETSC_FALSE; 3952 a->free_a = PETSC_TRUE; 3953 a->free_ij = PETSC_TRUE; 3954 PetscCall(PetscMalloc1(A->rmap->n+1,&a->i)); 3955 a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 3956 PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 3957 a->nz = a->maxnz = a->i[A->rmap->n]; 3958 a->rmax = 0; 3959 PetscCall(PetscMalloc1(a->nz,&a->a)); 3960 PetscCall(PetscMalloc1(a->nz,&a->j)); 3961 PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 3962 if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen)); 3963 if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax)); 3964 for (PetscInt i = 0; i < A->rmap->n; i++) { 3965 const PetscInt nnzr = a->i[i+1] - a->i[i]; 3966 nzr += (PetscInt)!!(nnzr); 3967 a->ilen[i] = a->imax[i] = nnzr; 3968 a->rmax = PetscMax(a->rmax,nnzr); 3969 } 3970 a->nonzerorowcnt = nzr; 3971 A->preallocated = PETSC_TRUE; 3972 PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt))); 3973 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 3974 } else { 3975 PetscCall(MatSeqAIJSetPreallocation(A,0,NULL)); 3976 } 3977 PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE)); 3978 3979 /* We want to allocate the CUSPARSE struct for matvec now. 3980 The code is so convoluted now that I prefer to copy zeros */ 3981 PetscCall(PetscArrayzero(a->a,a->nz)); 3982 PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6)); 3983 A->offloadmask = PETSC_OFFLOAD_CPU; 3984 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3985 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 3986 PetscFunctionReturn(0); 3987 } 3988 3989 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 3990 { 3991 Mat_SeqAIJ *seq; 3992 Mat_SeqAIJCUSPARSE *dev; 3993 PetscBool coo_basic = PETSC_TRUE; 3994 PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 3995 3996 PetscFunctionBegin; 3997 PetscCall(MatResetPreallocationCOO_SeqAIJ(mat)); 3998 PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat)); 3999 if (coo_i) { 4000 PetscCall(PetscGetMemType(coo_i,&mtype)); 4001 if (PetscMemTypeHost(mtype)) { 4002 for (PetscCount k=0; k<coo_n; k++) { 4003 if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;} 4004 } 4005 } 4006 } 4007 4008 if (coo_basic) { /* i,j are on device or do not contain negative indices */ 4009 PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j)); 4010 } else { 4011 PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 4012 mat->offloadmask = PETSC_OFFLOAD_CPU; 4013 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4014 seq = static_cast<Mat_SeqAIJ*>(mat->data); 4015 dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr); 4016 PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount))); 4017 PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice)); 4018 PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount))); 4019 PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice)); 4020 dev->use_extended_coo = PETSC_TRUE; 4021 } 4022 PetscFunctionReturn(0); 4023 } 4024 4025 __global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[]) 4026 { 4027 PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 4028 const PetscCount grid_size = gridDim.x * blockDim.x; 4029 for (; i<nnz; i+= grid_size) { 4030 PetscScalar sum = 0.0; 4031 for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]]; 4032 a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum; 4033 } 4034 } 4035 4036 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 4037 { 4038 Mat_SeqAIJ *seq = (Mat_SeqAIJ*)A->data; 4039 Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE*)A->spptr; 4040 PetscCount Annz = seq->nz; 4041 PetscMemType memtype; 4042 const PetscScalar *v1 = v; 4043 PetscScalar *Aa; 4044 4045 PetscFunctionBegin; 4046 if (dev->use_extended_coo) { 4047 PetscCall(PetscGetMemType(v,&memtype)); 4048 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 4049 PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar))); 4050 PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4051 } 4052 4053 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); 4054 else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa)); 4055 4056 if (Annz) { 4057 MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa); 4058 PetscCallCUDA(cudaPeekAtLastError()); 4059 } 4060 4061 if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa)); 4062 else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa)); 4063 4064 if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1)); 4065 } else { 4066 PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode)); 4067 } 4068 PetscFunctionReturn(0); 4069 } 4070 4071 /*@C 4072 MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 4073 4074 Not collective 4075 4076 Input Parameters: 4077 + A - the matrix 4078 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4079 4080 Output Parameters: 4081 + ia - the CSR row pointers 4082 - ja - the CSR column indices 4083 4084 Level: developer 4085 4086 Notes: 4087 When compressed is true, the CSR structure does not contain empty rows 4088 4089 .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()` 4090 @*/ 4091 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4092 { 4093 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4094 CsrMatrix *csr; 4095 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4096 4097 PetscFunctionBegin; 4098 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4099 if (!i || !j) PetscFunctionReturn(0); 4100 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4101 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4102 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4103 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4104 csr = (CsrMatrix*)cusp->mat->mat; 4105 if (i) { 4106 if (!compressed && a->compressedrow.use) { /* need full row offset */ 4107 if (!cusp->rowoffsets_gpu) { 4108 cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4109 cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4110 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 4111 } 4112 *i = cusp->rowoffsets_gpu->data().get(); 4113 } else *i = csr->row_offsets->data().get(); 4114 } 4115 if (j) *j = csr->column_indices->data().get(); 4116 PetscFunctionReturn(0); 4117 } 4118 4119 /*@C 4120 MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 4121 4122 Not collective 4123 4124 Input Parameters: 4125 + A - the matrix 4126 - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 4127 4128 Output Parameters: 4129 + ia - the CSR row pointers 4130 - ja - the CSR column indices 4131 4132 Level: developer 4133 4134 .seealso: `MatSeqAIJCUSPARSEGetIJ()` 4135 @*/ 4136 PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j) 4137 { 4138 PetscFunctionBegin; 4139 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4140 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4141 if (i) *i = NULL; 4142 if (j) *j = NULL; 4143 PetscFunctionReturn(0); 4144 } 4145 4146 /*@C 4147 MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4148 4149 Not Collective 4150 4151 Input Parameter: 4152 . A - a MATSEQAIJCUSPARSE matrix 4153 4154 Output Parameter: 4155 . a - pointer to the device data 4156 4157 Level: developer 4158 4159 Notes: may trigger host-device copies if up-to-date matrix data is on host 4160 4161 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()` 4162 @*/ 4163 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4164 { 4165 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4166 CsrMatrix *csr; 4167 4168 PetscFunctionBegin; 4169 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4170 PetscValidPointer(a,2); 4171 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4172 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4173 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4174 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4175 csr = (CsrMatrix*)cusp->mat->mat; 4176 PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4177 *a = csr->values->data().get(); 4178 PetscFunctionReturn(0); 4179 } 4180 4181 /*@C 4182 MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 4183 4184 Not Collective 4185 4186 Input Parameter: 4187 . A - a MATSEQAIJCUSPARSE matrix 4188 4189 Output Parameter: 4190 . a - pointer to the device data 4191 4192 Level: developer 4193 4194 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()` 4195 @*/ 4196 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 4197 { 4198 PetscFunctionBegin; 4199 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4200 PetscValidPointer(a,2); 4201 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4202 *a = NULL; 4203 PetscFunctionReturn(0); 4204 } 4205 4206 /*@C 4207 MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4208 4209 Not Collective 4210 4211 Input Parameter: 4212 . A - a MATSEQAIJCUSPARSE matrix 4213 4214 Output Parameter: 4215 . a - pointer to the device data 4216 4217 Level: developer 4218 4219 Notes: may trigger host-device copies if up-to-date matrix data is on host 4220 4221 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()` 4222 @*/ 4223 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 4224 { 4225 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4226 CsrMatrix *csr; 4227 4228 PetscFunctionBegin; 4229 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4230 PetscValidPointer(a,2); 4231 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4232 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4233 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4234 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4235 csr = (CsrMatrix*)cusp->mat->mat; 4236 PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4237 *a = csr->values->data().get(); 4238 A->offloadmask = PETSC_OFFLOAD_GPU; 4239 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 4240 PetscFunctionReturn(0); 4241 } 4242 /*@C 4243 MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray() 4244 4245 Not Collective 4246 4247 Input Parameter: 4248 . A - a MATSEQAIJCUSPARSE matrix 4249 4250 Output Parameter: 4251 . a - pointer to the device data 4252 4253 Level: developer 4254 4255 .seealso: `MatSeqAIJCUSPARSEGetArray()` 4256 @*/ 4257 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 4258 { 4259 PetscFunctionBegin; 4260 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4261 PetscValidPointer(a,2); 4262 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4263 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 4264 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4265 *a = NULL; 4266 PetscFunctionReturn(0); 4267 } 4268 4269 /*@C 4270 MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 4271 4272 Not Collective 4273 4274 Input Parameter: 4275 . A - a MATSEQAIJCUSPARSE matrix 4276 4277 Output Parameter: 4278 . a - pointer to the device data 4279 4280 Level: developer 4281 4282 Notes: does not trigger host-device copies and flags data validity on the GPU 4283 4284 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()` 4285 @*/ 4286 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 4287 { 4288 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4289 CsrMatrix *csr; 4290 4291 PetscFunctionBegin; 4292 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4293 PetscValidPointer(a,2); 4294 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4295 PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4296 PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4297 csr = (CsrMatrix*)cusp->mat->mat; 4298 PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4299 *a = csr->values->data().get(); 4300 A->offloadmask = PETSC_OFFLOAD_GPU; 4301 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 4302 PetscFunctionReturn(0); 4303 } 4304 4305 /*@C 4306 MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite() 4307 4308 Not Collective 4309 4310 Input Parameter: 4311 . A - a MATSEQAIJCUSPARSE matrix 4312 4313 Output Parameter: 4314 . a - pointer to the device data 4315 4316 Level: developer 4317 4318 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()` 4319 @*/ 4320 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 4321 { 4322 PetscFunctionBegin; 4323 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4324 PetscValidPointer(a,2); 4325 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4326 PetscCall(MatSeqAIJInvalidateDiagonal(A)); 4327 PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4328 *a = NULL; 4329 PetscFunctionReturn(0); 4330 } 4331 4332 struct IJCompare4 4333 { 4334 __host__ __device__ 4335 inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4336 { 4337 if (t1.get<0>() < t2.get<0>()) return true; 4338 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4339 return false; 4340 } 4341 }; 4342 4343 struct Shift 4344 { 4345 int _shift; 4346 4347 Shift(int shift) : _shift(shift) {} 4348 __host__ __device__ 4349 inline int operator() (const int &c) 4350 { 4351 return c + _shift; 4352 } 4353 }; 4354 4355 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */ 4356 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4357 { 4358 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4359 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4360 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4361 CsrMatrix *Acsr,*Bcsr,*Ccsr; 4362 PetscInt Annz,Bnnz; 4363 cusparseStatus_t stat; 4364 PetscInt i,m,n,zero = 0; 4365 4366 PetscFunctionBegin; 4367 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4368 PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4369 PetscValidPointer(C,4); 4370 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4371 PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 4372 PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 4373 PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 4374 PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4375 PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4376 if (reuse == MAT_INITIAL_MATRIX) { 4377 m = A->rmap->n; 4378 n = A->cmap->n + B->cmap->n; 4379 PetscCall(MatCreate(PETSC_COMM_SELF,C)); 4380 PetscCall(MatSetSizes(*C,m,n,m,n)); 4381 PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE)); 4382 c = (Mat_SeqAIJ*)(*C)->data; 4383 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4384 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4385 Ccsr = new CsrMatrix; 4386 Cmat->cprowIndices = NULL; 4387 c->compressedrow.use = PETSC_FALSE; 4388 c->compressedrow.nrows = 0; 4389 c->compressedrow.i = NULL; 4390 c->compressedrow.rindex = NULL; 4391 Ccusp->workVector = NULL; 4392 Ccusp->nrows = m; 4393 Ccusp->mat = Cmat; 4394 Ccusp->mat->mat = Ccsr; 4395 Ccsr->num_rows = m; 4396 Ccsr->num_cols = n; 4397 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 4398 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 4399 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4400 PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 4401 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 4402 PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 4403 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4404 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4405 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4406 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4407 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 4408 PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4409 PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4410 4411 Acsr = (CsrMatrix*)Acusp->mat->mat; 4412 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4413 Annz = (PetscInt)Acsr->column_indices->size(); 4414 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4415 c->nz = Annz + Bnnz; 4416 Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4417 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4418 Ccsr->values = new THRUSTARRAY(c->nz); 4419 Ccsr->num_entries = c->nz; 4420 Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4421 if (c->nz) { 4422 auto Acoo = new THRUSTINTARRAY32(Annz); 4423 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4424 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4425 THRUSTINTARRAY32 *Aroff,*Broff; 4426 4427 if (a->compressedrow.use) { /* need full row offset */ 4428 if (!Acusp->rowoffsets_gpu) { 4429 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4430 Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 4431 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 4432 } 4433 Aroff = Acusp->rowoffsets_gpu; 4434 } else Aroff = Acsr->row_offsets; 4435 if (b->compressedrow.use) { /* need full row offset */ 4436 if (!Bcusp->rowoffsets_gpu) { 4437 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4438 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 4439 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 4440 } 4441 Broff = Bcusp->rowoffsets_gpu; 4442 } else Broff = Bcsr->row_offsets; 4443 PetscCall(PetscLogGpuTimeBegin()); 4444 stat = cusparseXcsr2coo(Acusp->handle, 4445 Aroff->data().get(), 4446 Annz, 4447 m, 4448 Acoo->data().get(), 4449 CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 4450 stat = cusparseXcsr2coo(Bcusp->handle, 4451 Broff->data().get(), 4452 Bnnz, 4453 m, 4454 Bcoo->data().get(), 4455 CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 4456 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4457 auto Aperm = thrust::make_constant_iterator(1); 4458 auto Bperm = thrust::make_constant_iterator(0); 4459 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4460 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4461 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 4462 #else 4463 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4464 auto Bcib = Bcsr->column_indices->begin(); 4465 auto Bcie = Bcsr->column_indices->end(); 4466 thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 4467 #endif 4468 auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 4469 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 4470 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 4471 auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 4472 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 4473 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4474 auto p1 = Ccusp->cooPerm->begin(); 4475 auto p2 = Ccusp->cooPerm->begin(); 4476 thrust::advance(p2,Annz); 4477 PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 4478 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 4479 thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 4480 #endif 4481 auto cci = thrust::make_counting_iterator(zero); 4482 auto cce = thrust::make_counting_iterator(c->nz); 4483 #if 0 //Errors on SUMMIT cuda 11.1.0 4484 PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4485 #else 4486 auto pred = thrust::identity<int>(); 4487 PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 4488 PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 4489 #endif 4490 stat = cusparseXcoo2csr(Ccusp->handle, 4491 Ccoo->data().get(), 4492 c->nz, 4493 m, 4494 Ccsr->row_offsets->data().get(), 4495 CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 4496 PetscCall(PetscLogGpuTimeEnd()); 4497 delete wPerm; 4498 delete Acoo; 4499 delete Bcoo; 4500 delete Ccoo; 4501 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4502 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4503 Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4504 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4505 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 4506 #endif 4507 if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4508 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 4509 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 4510 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4511 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4512 CsrMatrix *CcsrT = new CsrMatrix; 4513 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4514 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4515 4516 (*C)->form_explicit_transpose = PETSC_TRUE; 4517 (*C)->transupdated = PETSC_TRUE; 4518 Ccusp->rowoffsets_gpu = NULL; 4519 CmatT->cprowIndices = NULL; 4520 CmatT->mat = CcsrT; 4521 CcsrT->num_rows = n; 4522 CcsrT->num_cols = m; 4523 CcsrT->num_entries = c->nz; 4524 4525 CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4526 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4527 CcsrT->values = new THRUSTARRAY(c->nz); 4528 4529 PetscCall(PetscLogGpuTimeBegin()); 4530 auto rT = CcsrT->row_offsets->begin(); 4531 if (AT) { 4532 rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4533 thrust::advance(rT,-1); 4534 } 4535 if (BT) { 4536 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4537 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4538 thrust::copy(titb,tite,rT); 4539 } 4540 auto cT = CcsrT->column_indices->begin(); 4541 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4542 if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4543 auto vT = CcsrT->values->begin(); 4544 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4545 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4546 PetscCall(PetscLogGpuTimeEnd()); 4547 4548 PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 4549 PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 4550 PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4551 PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar))); 4552 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar))); 4553 PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 4554 PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4555 PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4556 PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4557 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4558 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4559 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4560 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 4561 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 4562 #endif 4563 Ccusp->matTranspose = CmatT; 4564 } 4565 } 4566 4567 c->singlemalloc = PETSC_FALSE; 4568 c->free_a = PETSC_TRUE; 4569 c->free_ij = PETSC_TRUE; 4570 PetscCall(PetscMalloc1(m+1,&c->i)); 4571 PetscCall(PetscMalloc1(c->nz,&c->j)); 4572 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4573 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4574 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4575 ii = *Ccsr->row_offsets; 4576 jj = *Ccsr->column_indices; 4577 PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4578 PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4579 } else { 4580 PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4581 PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4582 } 4583 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 4584 PetscCall(PetscMalloc1(m,&c->ilen)); 4585 PetscCall(PetscMalloc1(m,&c->imax)); 4586 c->maxnz = c->nz; 4587 c->nonzerorowcnt = 0; 4588 c->rmax = 0; 4589 for (i = 0; i < m; i++) { 4590 const PetscInt nn = c->i[i+1] - c->i[i]; 4591 c->ilen[i] = c->imax[i] = nn; 4592 c->nonzerorowcnt += (PetscInt)!!nn; 4593 c->rmax = PetscMax(c->rmax,nn); 4594 } 4595 PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 4596 PetscCall(PetscMalloc1(c->nz,&c->a)); 4597 (*C)->nonzerostate++; 4598 PetscCall(PetscLayoutSetUp((*C)->rmap)); 4599 PetscCall(PetscLayoutSetUp((*C)->cmap)); 4600 Ccusp->nonzerostate = (*C)->nonzerostate; 4601 (*C)->preallocated = PETSC_TRUE; 4602 } else { 4603 PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n); 4604 c = (Mat_SeqAIJ*)(*C)->data; 4605 if (c->nz) { 4606 Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4607 PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4608 PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4609 PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 4610 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4611 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 4612 PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4613 PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4614 Acsr = (CsrMatrix*)Acusp->mat->mat; 4615 Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4616 Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4617 PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size()); 4618 PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4619 PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4620 PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 4621 PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4622 auto pmid = Ccusp->cooPerm->begin(); 4623 thrust::advance(pmid,Acsr->num_entries); 4624 PetscCall(PetscLogGpuTimeBegin()); 4625 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4626 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4627 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4628 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4629 thrust::for_each(zibait,zieait,VecCUDAEquals()); 4630 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4631 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4632 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4633 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4634 thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 4635 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE)); 4636 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4637 PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4638 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4639 CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4640 CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4641 CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4642 auto vT = CcsrT->values->begin(); 4643 if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4644 if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4645 (*C)->transupdated = PETSC_TRUE; 4646 } 4647 PetscCall(PetscLogGpuTimeEnd()); 4648 } 4649 } 4650 PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 4651 (*C)->assembled = PETSC_TRUE; 4652 (*C)->was_assembled = PETSC_FALSE; 4653 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4654 PetscFunctionReturn(0); 4655 } 4656 4657 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4658 { 4659 bool dmem; 4660 const PetscScalar *av; 4661 4662 PetscFunctionBegin; 4663 dmem = isCudaMem(v); 4664 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av)); 4665 if (n && idx) { 4666 THRUSTINTARRAY widx(n); 4667 widx.assign(idx,idx+n); 4668 PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 4669 4670 THRUSTARRAY *w = NULL; 4671 thrust::device_ptr<PetscScalar> dv; 4672 if (dmem) { 4673 dv = thrust::device_pointer_cast(v); 4674 } else { 4675 w = new THRUSTARRAY(n); 4676 dv = w->data(); 4677 } 4678 thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4679 4680 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4681 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4682 thrust::for_each(zibit,zieit,VecCUDAEquals()); 4683 if (w) { 4684 PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); 4685 } 4686 delete w; 4687 } else { 4688 PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 4689 } 4690 if (!dmem) PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 4691 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av)); 4692 PetscFunctionReturn(0); 4693 } 4694