/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/async/for_each.h>

/* Enum-name strings in the layout PetscOptionsEnum() expects: value names first,
   then the enum type name, the option prefix, and a terminating 0 */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)        = 1,
      CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)        = 2,
      CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)        = 3,
      CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)        = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
*/
/* NOTE: CUSPARSE_SPMM_COO_ALG4 (=5) sorts before CUSPARSE_SPMM_CSR_ALG2 (=6) below because
   the arrays must be in 0-based integer value order for PetscOptionsEnum() to map positions
   back to cuSPARSE enum values */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif

/* Forward declarations of the routines implemented later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static
PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

/*
   MatCUSPARSESetStream - caches the given CUDA stream in the matrix' Mat_SeqAIJCUSPARSE
   structure and attaches it to the matrix' cuSPARSE handle, so subsequent cuSPARSE calls
   issued through that handle run on the stream.
   Errors with PETSC_ERR_COR if the GPU-side structure (A->spptr) has not been created.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSESetHandle - makes the matrix use the given cuSPARSE handle.
   If the matrix already owns a different handle, that handle is destroyed first.
   The (possibly new) handle's pointer mode is set to device pointers.
   Errors with PETSC_ERR_COR if the GPU-side structure (A->spptr) has not been created.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* destroy the handle we currently hold before adopting the caller's */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSEClearHandle - drops the matrix' reference to its cuSPARSE handle without
   destroying it.  No-op when A is not MATSEQAIJCUSPARSE or has no GPU-side structure.
   NOTE(review): the handle is deliberately not destroyed here — presumably ownership was
   transferred in via MatCUSPARSESetHandle(); confirm against callers.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg || !cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query callback composed on factor matrices: reports the solver package name ("cusparse") */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*
   Factory routine registered for MATSOLVERCUSPARSE: creates the (empty) square factor
   matrix B of type MATSEQAIJCUSPARSE on A's communicator, installs the symbolic
   factorization routines appropriate for ftype, and records the preferred orderings
   (nested dissection for LU/Cholesky, natural for the incomplete variants).
   Unsupported factor types raise PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    /* preferred orderings: nested dissection for full LU, natural for the ILU variants */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Type-specific implementation behind MatCUSPARSESetFormat(): records the requested
   GPU storage format.  For SEQAIJCUSPARSE there is a single format field, so
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL set the same thing; other operations error.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
   The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSetOption implementation: handles MAT_FORM_EXPLICIT_TRANSPOSE locally (invalidating
   any cached GPU transpose when the option is switched off); every other option is
   delegated to MatSetOption_SeqAIJ().
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
    break;
  default:
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
    break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/*
   Numeric LU factorization: the factorization itself runs on the CPU via
   MatLUFactorNumeric_SeqAIJ() (after pulling A's values back from the GPU), then the
   MatSolve implementations are chosen based on whether the row/column permutations are
   identities, and finally the triangular factors are copied to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    /* natural ordering: no permutation work needed in the solves */
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Options processing for SEQAIJCUSPARSE: storage format for SpMV / all operations and,
   for CUDA >= 11, the cuSPARSE SpMV, SpMM and CSR-to-CSC algorithm choices.  The guards
   after each algorithm option verify that the cuSPARSE enum values still match the
   positions PetscOptionsEnum() assigns from the string arrays at the top of this file.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Symbolic ILU factorization: discards any existing GPU triangular-factor data, runs the
   CPU SeqAIJ symbolic phase, and installs the CUSPARSE numeric factorization routine.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* throw away stale GPU factor data before (re)building the symbolic factorization */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/*
   Symbolic LU factorization: same pattern as the ILU variant — reset GPU factor data,
   run the CPU SeqAIJ symbolic phase, install the CUSPARSE numeric routine.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/*
   Symbolic ICC factorization: reset GPU factor data, run the CPU SeqAIJ symbolic phase,
   install the CUSPARSE Cholesky numeric routine.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/*
   Symbolic Cholesky factorization: reset GPU factor data, run the CPU SeqAIJ symbolic
   phase, install the CUSPARSE Cholesky numeric routine.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/*
   Extracts the unit lower triangular factor L out of the combined LU stored in the
   SeqAIJ factor matrix A, as a separate CSR matrix: built row by row in pinned host
   memory (cudaMallocHost), uploaded into thrust device arrays (32-bit indices), and
   handed to the cuSPARSE triangular-solve analysis.  When the structure already exists
   (loTriFactor set) only the numerical values are refreshed and re-uploaded.
   Exceptions thrown by the thrust/CUSPARSE C++ layer are converted to PETSc errors.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host buffers so the uploads below are fast */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix; row 0 is just the implicit unit diagonal */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer API needs an explicitly sized work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer; AALo is kept (as AA_h) for later value-only updates */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Extracts the upper triangular factor U out of the combined LU stored in the SeqAIJ
   factor matrix A, as a separate CSR matrix, mirroring the lower-triangular builder:
   built backwards (row n-1 down to 0) in pinned host memory, uploaded into thrust
   device arrays, and analyzed by cuSPARSE.  Note the diagonal entry is stored as its
   reciprocal (1./v[nz]) with CUSPARSE_DIAG_TYPE_NON_UNIT.  When the structure already
   exists, only the numerical values are refreshed and re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host buffers so the uploads below are fast */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements (stored as the reciprocal of the pivot) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer API needs an explicitly sized work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer; AAUp is kept (as AA_h) for later value-only updates */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v,
nz);CHKERRQ(ierr);
      }
      upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
      ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Builds both ILU triangular factors on the GPU and caches the row/column
   permutation indices of the ordering (a->row, a->icol) so MatSolve can apply
   them on the device.  Also (re)allocates the device work vector used during
   the triangular solves and marks the factor matrix as resident on both
   host and GPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector of length n shared by the lower/upper triangular solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: upload the row permutation only when it is not the identity */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices: upload the column permutation only when it is not the identity */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   Builds the two triangular factors used by the ICC (Cholesky) solve on the GPU.
   The factored matrix is stored upper-triangular; the "lower" factor reuses the
   same CSR pattern and is solved with CUSPARSE_OPERATION_TRANSPOSE.
   NOTE(review): A->data is read through both Mat_SeqAIJ and Mat_SeqSBAIJ views;
   this relies on the factor being stored in SBAIJ-compatible layout -- confirm
   against the Cholesky factorization that fills it.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the upper/lower factor values */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: the diagonal is stored inverted
             (diag type is UNIT for the upper factor, NON_UNIT for the lower) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];
748 749 offset+=1; 750 if (nz>0) { 751 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 752 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 753 for (j=offset; j<offset+nz; j++) { 754 AAUp[j] = -AAUp[j]; 755 AALo[j] = AAUp[j]/v[nz]; 756 } 757 offset+=nz; 758 } 759 } 760 761 /* allocate space for the triangular factor information */ 762 ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 763 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 764 765 /* Create the matrix description */ 766 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 767 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 768 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 769 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 770 #else 771 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 772 #endif 773 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 774 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 775 776 /* set the matrix */ 777 upTriFactor->csrMat = new CsrMatrix; 778 upTriFactor->csrMat->num_rows = A->rmap->n; 779 upTriFactor->csrMat->num_cols = A->cmap->n; 780 upTriFactor->csrMat->num_entries = a->nz; 781 782 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 783 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 784 785 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 786 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 787 788 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 789 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 790 791 /* set the operation */ 792 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 793 794 /* Create the solve analysis information */ 795 ierr = 
PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 796 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 797 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 798 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 799 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 800 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 801 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 802 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 803 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 804 #endif 805 806 /* perform the solve analysis */ 807 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 808 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 809 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 810 upTriFactor->csrMat->column_indices->data().get(), 811 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 812 upTriFactor->solveInfo, 813 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 814 #else 815 upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 816 #endif 817 cerr = WaitForCUDA();CHKERRCUDA(cerr); 818 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 819 820 /* assign the pointer */ 821 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 822 823 /* allocate space for the triangular factor information */ 824 ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 825 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 826 827 /* Create the matrix description */ 828 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 829 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 830 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 831 stat = 
cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 832 #else 833 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 834 #endif 835 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 836 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 837 838 /* set the operation */ 839 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 840 841 /* set the matrix */ 842 loTriFactor->csrMat = new CsrMatrix; 843 loTriFactor->csrMat->num_rows = A->rmap->n; 844 loTriFactor->csrMat->num_cols = A->cmap->n; 845 loTriFactor->csrMat->num_entries = a->nz; 846 847 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 848 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 849 850 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 851 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 852 853 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 854 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 855 856 /* Create the solve analysis information */ 857 ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 858 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 859 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 860 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 861 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 862 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 863 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 864 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 865 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 866 #endif 867 868 /* perform the solve analysis */ 869 stat = 
cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 870 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 871 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 872 loTriFactor->csrMat->column_indices->data().get(), 873 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 874 loTriFactor->solveInfo, 875 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat); 876 #else 877 loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 878 #endif 879 cerr = WaitForCUDA();CHKERRCUDA(cerr); 880 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 881 882 /* assign the pointer */ 883 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 884 885 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 886 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 887 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 888 } else { 889 /* Fill the upper triangular matrix */ 890 offset = 0; 891 for (i=0; i<n; i++) { 892 /* set the pointers */ 893 v = aa + ai[i]; 894 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 895 896 /* first, set the diagonal elements */ 897 AAUp[offset] = 1.0/v[nz]; 898 AALo[offset] = 1.0/v[nz]; 899 900 offset+=1; 901 if (nz>0) { 902 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 903 for (j=offset; j<offset+nz; j++) { 904 AAUp[j] = -AAUp[j]; 905 AALo[j] = AAUp[j]/v[nz]; 906 } 907 offset+=nz; 908 } 909 } 910 if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 911 if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 912 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 913 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 914 ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 915 } 916 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 917 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 918 } catch(char *ex) { 
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Builds the ICC triangular factors on the GPU and caches the symmetric
   permutation (and its inverse) of the ordering ip so MatSolve can apply it
   on the device.  Also allocates the device work vector for the solves.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* strictly triangular entries are shared by both factors; the diagonal counts once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: upload permutation and its inverse only for a
     non-identity ordering (row perm applied on input, inverse on output) */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   Numeric Cholesky factorization: delegates to the CPU AIJ kernel, then selects
   the natural-ordering or permuted MatSolve variants and copies the factors to
   the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factorization itself runs on the CPU, so pull the values down first */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr =
MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 973 B->offloadmask = PETSC_OFFLOAD_CPU; 974 /* determine which version of MatSolve needs to be used. */ 975 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 976 if (perm_identity) { 977 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 978 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 979 B->ops->matsolve = NULL; 980 B->ops->matsolvetranspose = NULL; 981 } else { 982 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 983 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 984 B->ops->matsolve = NULL; 985 B->ops->matsolvetranspose = NULL; 986 } 987 988 /* get the triangular factors */ 989 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 990 PetscFunctionReturn(0); 991 } 992 993 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 994 { 995 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 996 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 997 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 998 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 999 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1000 cusparseStatus_t stat; 1001 cusparseIndexBase_t indexBase; 1002 cusparseMatrixType_t matrixType; 1003 cusparseFillMode_t fillMode; 1004 cusparseDiagType_t diagType; 1005 cudaError_t cerr; 1006 PetscErrorCode ierr; 1007 1008 PetscFunctionBegin; 1009 /* allocate space for the transpose of the lower triangular factor */ 1010 ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1011 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1012 1013 /* set the matrix descriptors of the lower triangular factor */ 1014 matrixType = cusparseGetMatType(loTriFactor->descr); 1015 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1016 fillMode = 
cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1017 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1018 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1019 1020 /* Create the matrix description */ 1021 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 1022 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1023 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1024 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1025 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1026 1027 /* set the operation */ 1028 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1029 1030 /* allocate GPU space for the CSC of the lower triangular factor*/ 1031 loTriFactorT->csrMat = new CsrMatrix; 1032 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1033 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1034 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1035 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1036 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1037 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1038 1039 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* cusparseCsr2cscEx2 needs an explicitly sized scratch buffer; query then allocate it */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* fixed: this called PetscLogEventBegin() a second time, which left the
     MAT_CUSPARSEGenerateTranspose event begun above unbalanced (nested Begin
     with no matching End corrupts -log_view accounting) */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1074 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1075 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1076 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1077 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1078 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1079 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1080 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1081 #endif 1082 1083 /* perform the solve analysis */ 1084 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1085 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1086 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1087 loTriFactorT->csrMat->column_indices->data().get(), 1088 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1089 loTriFactorT->solveInfo, 1090 loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat); 1091 #else 1092 loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1093 #endif 1094 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1095 ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1096 1097 /* assign the pointer */ 1098 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1099 1100 /*********************************************/ 1101 /* Now the Transpose of the Upper Tri Factor */ 1102 /*********************************************/ 1103 1104 /* allocate space for the transpose of the upper triangular factor */ 1105 ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1106 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1107 1108 /* set the matrix descriptors of the upper triangular factor */ 1109 matrixType = cusparseGetMatType(upTriFactor->descr); 
1110 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1111 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1112 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1113 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1114 1115 /* Create the matrix description */ 1116 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1117 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1118 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1119 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1120 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1121 1122 /* set the operation */ 1123 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1124 1125 /* allocate GPU space for the CSC of the upper triangular factor*/ 1126 upTriFactorT->csrMat = new CsrMatrix; 1127 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1128 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1129 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1130 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1131 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1132 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1133 1134 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* cusparseCsr2cscEx2 needs an explicitly sized scratch buffer; query then allocate it */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
#endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* fixed: this called PetscLogEventBegin() a second time, which left the
     MAT_CUSPARSEGenerateTranspose event begun above unbalanced (nested Begin
     with no matching End corrupts -log_view accounting) */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  /* query and allocate the scratch buffer the analysis/solve requires */
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           upTriFactorT->solveInfo,
                           upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/* Unary functor used below with thrust::transform to turn permutation indices
   that were carried through csr2csc as scalar values back into PetscInt
   (takes the real part so it also works with complex scalars) */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};

/*
   Forms (or refreshes) the explicit transpose of the AIJ matrix on the GPU.
   A no-op when A->transupdated is already set; otherwise builds the transposed
   CSR (or HYB/ELL pre-CUDA-11) structure and fills its values, caching the
   csr2csc permutation so later updates are a pure device-side gather.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
cusparseStatus_t stat; 1214 cusparseIndexBase_t indexBase; 1215 cudaError_t err; 1216 PetscErrorCode ierr; 1217 1218 PetscFunctionBegin; 1219 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1220 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1221 if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1222 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1223 if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 1224 if (A->transupdated) PetscFunctionReturn(0); 1225 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1226 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1227 if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 1228 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr); 1229 } 1230 if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1231 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1232 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1233 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1234 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1235 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1236 1237 /* set alpha and beta */ 1238 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1239 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1240 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1241 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1242 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1243 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, 
sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1244 1245 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1246 CsrMatrix *matrixT = new CsrMatrix; 1247 matstructT->mat = matrixT; 1248 matrixT->num_rows = A->cmap->n; 1249 matrixT->num_cols = A->rmap->n; 1250 matrixT->num_entries = a->nz; 1251 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1252 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1253 matrixT->values = new THRUSTARRAY(a->nz); 1254 1255 if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 1256 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1257 1258 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1259 #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1260 stat = cusparseCreateCsr(&matstructT->matDescr, 1261 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1262 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1263 matrixT->values->data().get(), 1264 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1265 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1266 #else 1267 /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 1268 see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 1269 1270 I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 1271 it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 1272 when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
1273 */ 1274 if (matrixT->num_entries) { 1275 stat = cusparseCreateCsr(&matstructT->matDescr, 1276 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1277 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1278 matrixT->values->data().get(), 1279 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 1280 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1281 1282 } else { 1283 matstructT->matDescr = NULL; 1284 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1285 } 1286 #endif 1287 #endif 1288 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1289 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1290 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1291 #else 1292 CsrMatrix *temp = new CsrMatrix; 1293 CsrMatrix *tempT = new CsrMatrix; 1294 /* First convert HYB to CSR */ 1295 temp->num_rows = A->rmap->n; 1296 temp->num_cols = A->cmap->n; 1297 temp->num_entries = a->nz; 1298 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1299 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1300 temp->values = new THRUSTARRAY(a->nz); 1301 1302 stat = cusparse_hyb2csr(cusparsestruct->handle, 1303 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1304 temp->values->data().get(), 1305 temp->row_offsets->data().get(), 1306 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1307 1308 /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1309 tempT->num_rows = A->rmap->n; 1310 tempT->num_cols = A->cmap->n; 1311 tempT->num_entries = a->nz; 1312 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1313 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1314 tempT->values = new THRUSTARRAY(a->nz); 1315 1316 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1317 temp->num_cols, temp->num_entries, 1318 temp->values->data().get(), 1319 temp->row_offsets->data().get(), 1320 temp->column_indices->data().get(), 1321 tempT->values->data().get(), 1322 tempT->column_indices->data().get(), 1323 tempT->row_offsets->data().get(), 1324 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1325 1326 /* Last, convert CSC to HYB */ 1327 cusparseHybMat_t hybMat; 1328 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1329 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1330 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1331 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1332 matstructT->descr, tempT->values->data().get(), 1333 tempT->row_offsets->data().get(), 1334 tempT->column_indices->data().get(), 1335 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1336 1337 /* assign the pointer */ 1338 matstructT->mat = hybMat; 1339 A->transupdated = PETSC_TRUE; 1340 /* delete temporaries */ 1341 if (tempT) { 1342 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1343 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1344 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1345 delete (CsrMatrix*) tempT; 1346 } 1347 if (temp) { 1348 if (temp->values) delete (THRUSTARRAY*) temp->values; 1349 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1350 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1351 delete (CsrMatrix*) temp; 1352 } 1353 #endif 1354 } 1355 } 1356 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* 
transpose mat struct may be already present, update data */ 1357 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1358 CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 1359 if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 1360 if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 1361 if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 1362 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 1363 if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 1364 if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 1365 if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 1366 if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1367 if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1368 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1369 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 1370 ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 1371 } 1372 if (!cusparsestruct->csr2csc_i) { 1373 THRUSTARRAY csr2csc_a(matrix->num_entries); 1374 PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1375 1376 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1377 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1378 void *csr2cscBuffer; 1379 size_t csr2cscBufferSize; 1380 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1381 A->cmap->n, matrix->num_entries, 1382 matrix->values->data().get(), 1383 cusparsestruct->rowoffsets_gpu->data().get(), 1384 matrix->column_indices->data().get(), 1385 matrixT->values->data().get(), 1386 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 
cusparse_scalartype, 1387 CUSPARSE_ACTION_NUMERIC,indexBase, 1388 cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat); 1389 err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err); 1390 #endif 1391 1392 if (matrix->num_entries) { 1393 /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 1394 mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 1395 I checked every parameters and they were just fine. I have no clue why cusparse complains. 1396 1397 Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 1398 should be filled with indexBase. So I just take a shortcut here. 1399 */ 1400 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1401 A->cmap->n,matrix->num_entries, 1402 csr2csc_a.data().get(), 1403 cusparsestruct->rowoffsets_gpu->data().get(), 1404 matrix->column_indices->data().get(), 1405 matrixT->values->data().get(), 1406 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1407 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1408 CUSPARSE_ACTION_NUMERIC,indexBase, 1409 cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat); 1410 #else 1411 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1412 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1413 #endif 1414 } else { 1415 matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 1416 } 1417 1418 cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1419 PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1420 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1421 err = cudaFree(csr2cscBuffer);CHKERRCUDA(err); 1422 #endif 1423 } 1424 PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), 
cusparsestruct->csr2csc_i->begin()),
                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer; the transpose is now cached and valid until the values change */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* Solve A^T x = b using the (lazily built) transposed triangular factors.
   The right-hand side is first permuted with rpermIndices, then U^T and L^T solves are applied,
   and finally the solution is permuted back with cpermIndices.
   Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
     on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation; the permuted vector lands in x which is used as scratch */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve U (for the transposed system the U factor is applied first).
     NOTE: the argument list is split by CUDA-version preprocessor branches; keep the order intact. */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Same as MatSolveTranspose_SeqAIJCUSPARSE but for factors in natural ordering:
   no row/column permutations are applied, so b and x are used directly. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U (the U factor is applied first for the transposed system).
     NOTE: the argument list is split by CUDA-version preprocessor branches; keep the order intact. */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Solve A x = b on the GPU using the L and U triangular factors.
   The right-hand side is permuted with rpermIndices into the work vector, L and U solves
   are applied, and the result is permuted back with cpermIndices into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L.
     NOTE: the argument list is split by CUDA-version preprocessor branches; keep the order intact. */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Then, solve U (in place through x and the work vector) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation into the output vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Same as MatSolve_SeqAIJCUSPARSE but for factors in natural ordering:
   no permutations are applied, so b feeds the L solve directly and the U solve writes x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L.
     NOTE: the argument list is split by CUDA-version preprocessor branches; keep the order intact. */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
                        xarray);CHKERRCUSPARSE(stat);
#endif

  ierr =
VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Copy the matrix values from the GPU CSR storage back into the host AIJ array (a->a).
   Only runs when the GPU copy is the up-to-date one (PETSC_OFFLOAD_GPU); afterwards both
   copies are valid (PETSC_OFFLOAD_BOTH). Only values move; the nonzero pattern is shared. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Return the host value array after synchronizing it with the GPU copy.
   The mask is set to PETSC_OFFLOAD_CPU because the caller gets write access to the host
   array, which invalidates the GPU copy. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = a->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}

/* Upload the host AIJ matrix to the GPU in the requested cusparse storage format.
   Fast path: if the nonzero pattern is unchanged (same nonzerostate) and the format is CSR,
   only the values are re-uploaded and the cached transpose values are invalidated.
   Slow path: the whole GPU structure (descriptor, row offsets, column indices, values,
   compressed-row indices, work vector) is destroyed and rebuilt. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed: transpose values are stale, but its structure can be kept (PETSC_FALSE) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      /* pattern changed: destroy the transpose structure too (PETSC_TRUE) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* with compressed rows only the nonempty rows are stored; ridx maps them back to global rows */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* no value array yet: upload structure only and do not mark the copies as synchronized */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalars so cusparse can run in CUSPARSE_POINTER_MODE_DEVICE */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* build a temporary CSR copy, convert to HYB, then discard the CSR temporaries */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: accumulate the first tuple element into the second (y += x) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: copy the first tuple element into the second (y = x) */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor: copy the second tuple element into the first (x = y) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product context attached to C->product->data for sparse matrix products on the GPU */
struct MatMatCusparse {
  PetscBool      cisdense;  /* whether C should be converted back to plain MATSEQDENSE */
  PetscScalar    *Bt;       /* explicit transpose of a dense B (pre-CUDA-11 path) */
  Mat            X;         /* intermediate dense result for PtAP/RARt */
  PetscBool      reusesym;  /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix      *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void *dBuffer4;
  void *dBuffer5;
#endif
  size_t mmBufferSize;
  void   *mmBuffer;
  void   *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor for a MatMatCusparse context: releases all device buffers, cusparse
   descriptors, the intermediate dense matrix X, and finally the context itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
#endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
  if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
#endif
  if (mmdata->mmBuffer)  { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
#endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/* Numeric phase of sparse(A) x dense(B) products (AB, AtB, ABt, PtAP, RARt) using
   cusparse SpMM (CUDA >= 11) or csrmm (older CUDA). For PtAP/RARt the sparse product is
   computed into the intermediate mmdata->X and finished with a dense-dense product. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick op(A) and the output dimensions m x n for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either use op(A)=T on the fly, or multiply with an explicitly stored transpose */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse product into the intermediate X, not into C directly */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the workspace only when the required size increased */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B, so explicitly transpose B into Bt with cublas geam */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt with a dense-dense product of B and the intermediate X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* convert back to plain dense/host types if that is what the caller provided */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Symbolic phase for sparse(A) x dense(B) products (continues past this chunk) */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  switch
(product->type) { 2204 case MATPRODUCT_AB: 2205 m = A->rmap->n; 2206 n = B->cmap->n; 2207 break; 2208 case MATPRODUCT_AtB: 2209 m = A->cmap->n; 2210 n = B->cmap->n; 2211 break; 2212 case MATPRODUCT_ABt: 2213 m = A->rmap->n; 2214 n = B->rmap->n; 2215 break; 2216 case MATPRODUCT_PtAP: 2217 m = B->cmap->n; 2218 n = B->cmap->n; 2219 break; 2220 case MATPRODUCT_RARt: 2221 m = B->rmap->n; 2222 n = B->rmap->n; 2223 break; 2224 default: 2225 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2226 } 2227 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2228 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2229 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2230 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2231 2232 /* product data */ 2233 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2234 mmdata->cisdense = cisdense; 2235 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2236 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2237 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2238 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2239 } 2240 #endif 2241 /* for these products we need intermediate storage */ 2242 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2243 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2244 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2245 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2246 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2247 } else { 2248 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2249 } 2250 } 2251 
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/*
   Numeric phase of the sparse-sparse product C = op(A)*op(B) with A, B, C all
   SeqAIJCUSPARSE in CSR format.  Assumes the symbolic phase already created the
   product data (descriptors, buffers, C's sparsity pattern); only recomputes values.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty result: nothing to compute, just flag assembly */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* map transposed products to plain AB using explicit transposes (or symmetry) */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* SpGEMMreuse: the sparsity analysis was done at symbolic time, only values are recomputed */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  /* SpGEMM API: compute must be followed by copy to move the result into C's arrays */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#endif
#else
  /* legacy (pre-11.0) csrgemm interface */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

/*
   Symbolic phase of the sparse-sparse product C = op(A)*op(B) with A, B
   SeqAIJCUSPARSE in CSR format: determines C's sparsity pattern on the GPU,
   allocates its storage (device and host copies of i/j) and the SpGEMM
   descriptors/buffers reused by the numeric phase.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data = mmdata;
C->product->destroy = MatDestroy_MatMatCusparse; 2422 2423 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2424 ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2425 Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2426 Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2427 if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2428 if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2429 2430 ptype = product->type; 2431 if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2432 if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2433 biscompressed = PETSC_FALSE; 2434 ciscompressed = PETSC_FALSE; 2435 switch (ptype) { 2436 case MATPRODUCT_AB: 2437 m = A->rmap->n; 2438 n = B->cmap->n; 2439 k = A->cmap->n; 2440 Amat = Acusp->mat; 2441 Bmat = Bcusp->mat; 2442 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2443 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2444 break; 2445 case MATPRODUCT_AtB: 2446 m = A->cmap->n; 2447 n = B->cmap->n; 2448 k = A->rmap->n; 2449 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr); 2450 Amat = Acusp->matTranspose; 2451 Bmat = Bcusp->mat; 2452 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2453 break; 2454 case MATPRODUCT_ABt: 2455 m = A->rmap->n; 2456 n = B->rmap->n; 2457 k = A->cmap->n; 2458 ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr); 2459 Amat = Acusp->mat; 2460 Bmat = Bcusp->matTranspose; 2461 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2462 break; 2463 default: 2464 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2465 } 2466 2467 /* create cusparse matrix */ 2468 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2469 ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2470 c = 
(Mat_SeqAIJ*)C->data; 2471 Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2472 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2473 Ccsr = new CsrMatrix; 2474 2475 c->compressedrow.use = ciscompressed; 2476 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2477 c->compressedrow.nrows = a->compressedrow.nrows; 2478 ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2479 ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2480 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2481 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2482 Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2483 } else { 2484 c->compressedrow.nrows = 0; 2485 c->compressedrow.i = NULL; 2486 c->compressedrow.rindex = NULL; 2487 Ccusp->workVector = NULL; 2488 Cmat->cprowIndices = NULL; 2489 } 2490 Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 2491 Ccusp->mat = Cmat; 2492 Ccusp->mat->mat = Ccsr; 2493 Ccsr->num_rows = Ccusp->nrows; 2494 Ccsr->num_cols = n; 2495 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2496 stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2497 stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2498 stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2499 cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2500 cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2501 cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2502 cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2503 cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2504 cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2505 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2506 thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2507 c->nz = 0; 2508 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2509 Ccsr->values = new THRUSTARRAY(c->nz); 2510 goto finalizesym; 2511 } 2512 2513 if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2514 if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2515 Acsr = (CsrMatrix*)Amat->mat; 2516 if (!biscompressed) { 2517 Bcsr = (CsrMatrix*)Bmat->mat; 2518 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2519 BmatSpDescr = Bmat->matDescr; 2520 #endif 2521 } else { /* we need to use row offsets for the full matrix */ 2522 CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2523 Bcsr = new CsrMatrix; 2524 Bcsr->num_rows = B->rmap->n; 2525 Bcsr->num_cols = cBcsr->num_cols; 2526 Bcsr->num_entries = cBcsr->num_entries; 2527 Bcsr->column_indices = cBcsr->column_indices; 2528 Bcsr->values = cBcsr->values; 2529 if (!Bcusp->rowoffsets_gpu) { 2530 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2531 Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2532 ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2533 } 2534 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2535 mmdata->Bcsr = Bcsr; 2536 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2537 if (Bcsr->num_rows && Bcsr->num_cols) { 2538 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2539 Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2540 Bcsr->values->data().get(), 2541 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2542 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2543 } 2544 BmatSpDescr = mmdata->matSpBDescr; 2545 #endif 2546 } 2547 if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 2548 if (!Bcsr) 
SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2549 /* precompute flops count */ 2550 if (ptype == MATPRODUCT_AB) { 2551 for (i=0, flops = 0; i<A->rmap->n; i++) { 2552 const PetscInt st = a->i[i]; 2553 const PetscInt en = a->i[i+1]; 2554 for (j=st; j<en; j++) { 2555 const PetscInt brow = a->j[j]; 2556 flops += 2.*(b->i[brow+1] - b->i[brow]); 2557 } 2558 } 2559 } else if (ptype == MATPRODUCT_AtB) { 2560 for (i=0, flops = 0; i<A->rmap->n; i++) { 2561 const PetscInt anzi = a->i[i+1] - a->i[i]; 2562 const PetscInt bnzi = b->i[i+1] - b->i[i]; 2563 flops += (2.*anzi)*bnzi; 2564 } 2565 } else { /* TODO */ 2566 flops = 0.; 2567 } 2568 2569 mmdata->flops = flops; 2570 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2571 2572 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2573 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2574 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2575 NULL, NULL, NULL, 2576 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2577 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2578 stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2579 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2580 { 2581 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2582 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2583 */ 2584 void* dBuffer1 = NULL; 2585 void* dBuffer2 = NULL; 2586 void* dBuffer3 = NULL; 2587 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2588 size_t bufferSize1 = 0; 2589 size_t bufferSize2 = 0; 2590 size_t bufferSize3 = 0; 2591 size_t bufferSize4 = 0; 2592 size_t bufferSize5 = 0; 2593 2594 /*----------------------------------------------------------------------*/ 2595 /* ask bufferSize1 bytes for external memory */ 2596 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2597 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2598 &bufferSize1, NULL);CHKERRCUSPARSE(stat); 2599 cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr); 2600 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2601 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2602 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2603 &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat); 2604 2605 /*----------------------------------------------------------------------*/ 2606 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2607 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2608 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat); 2609 cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr); 2610 cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr); 2611 cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr); 2612 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2613 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2614 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, 
mmdata->dBuffer4);CHKERRCUSPARSE(stat); 2615 cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr); 2616 cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr); 2617 2618 /*----------------------------------------------------------------------*/ 2619 /* get matrix C non-zero entries C_nnz1 */ 2620 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2621 c->nz = (PetscInt) C_nnz1; 2622 /* allocate matrix C */ 2623 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2624 Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2625 /* update matC with the new pointers */ 2626 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2627 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2628 2629 /*----------------------------------------------------------------------*/ 2630 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2631 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2632 &bufferSize5, NULL);CHKERRCUSPARSE(stat); 2633 cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr); 2634 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2635 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 2636 &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat); 2637 cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr); 2638 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2639 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2640 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2641 mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2642 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr); 2643 } 
2644 #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2645 size_t bufSize2; 2646 /* ask bufferSize bytes for external memory */ 2647 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2648 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2649 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2650 mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2651 cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr); 2652 /* inspect the matrices A and B to understand the memory requirement for the next step */ 2653 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2654 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2655 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2656 mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2657 /* ask bufferSize again bytes for external memory */ 2658 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2659 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2660 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2661 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2662 /* The CUSPARSE documentation is not clear, nor the API 2663 We need both buffers to perform the operations properly! 2664 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2665 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2666 is stored in the descriptor! What a messy API... 
*/ 2667 cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr); 2668 /* compute the intermediate product of A * B */ 2669 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2670 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2671 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2672 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2673 /* get matrix C non-zero entries C_nnz1 */ 2674 stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2675 c->nz = (PetscInt) C_nnz1; 2676 ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr); 2677 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2678 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2679 Ccsr->values = new THRUSTARRAY(c->nz); 2680 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2681 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2682 Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2683 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2684 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2685 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2686 #endif 2687 #else 2688 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 2689 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2690 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2691 Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2692 Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2693 Cmat->descr, Ccsr->row_offsets->data().get(), 
&cnz);CHKERRCUSPARSE(stat); 2694 c->nz = cnz; 2695 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2696 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2697 Ccsr->values = new THRUSTARRAY(c->nz); 2698 CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2699 2700 stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2701 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2702 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2703 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2704 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2705 Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2706 Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2707 Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2708 Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2709 #endif 2710 ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2711 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2712 finalizesym: 2713 c->singlemalloc = PETSC_FALSE; 2714 c->free_a = PETSC_TRUE; 2715 c->free_ij = PETSC_TRUE; 2716 ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2717 ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2718 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2719 PetscInt *d_i = c->i; 2720 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2721 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2722 ii = *Ccsr->row_offsets; 2723 jj = *Ccsr->column_indices; 2724 if (ciscompressed) d_i = c->compressedrow.i; 2725 cerr = 
cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2726 cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2727 } else { 2728 PetscInt *d_i = c->i; 2729 if (ciscompressed) d_i = c->compressedrow.i; 2730 cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2731 cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2732 } 2733 if (ciscompressed) { /* need to expand host row offsets */ 2734 PetscInt r = 0; 2735 c->i[0] = 0; 2736 for (k = 0; k < c->compressedrow.nrows; k++) { 2737 const PetscInt next = c->compressedrow.rindex[k]; 2738 const PetscInt old = c->compressedrow.i[k]; 2739 for (; r < next; r++) c->i[r+1] = old; 2740 } 2741 for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2742 } 2743 ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2744 ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2745 ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2746 c->maxnz = c->nz; 2747 c->nonzerorowcnt = 0; 2748 c->rmax = 0; 2749 for (k = 0; k < m; k++) { 2750 const PetscInt nn = c->i[k+1] - c->i[k]; 2751 c->ilen[k] = c->imax[k] = nn; 2752 c->nonzerorowcnt += (PetscInt)!!nn; 2753 c->rmax = PetscMax(c->rmax,nn); 2754 } 2755 ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2756 ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2757 Ccsr->num_entries = c->nz; 2758 2759 C->nonzerostate++; 2760 ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2761 ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2762 Ccusp->nonzerostate = C->nonzerostate; 2763 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2764 C->preallocated = PETSC_TRUE; 2765 C->assembled = PETSC_FALSE; 2766 C->was_assembled = PETSC_FALSE; 
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* Selects the symbolic-product implementation for a SeqAIJCUSPARSE matrix product.
   Handles sparse or dense B. Per product type, a -mat*_backend_cpu option lets the
   user force the CPU backend; otherwise the GPU path is chosen whenever all operands
   are (unbound) CUSPARSE matrices. Falls back to the plain SeqAIJ dispatch otherwise. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* a matrix bound to the CPU disqualifies the GPU backend */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* yy = A*xx */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A*xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^H*xx */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A^H*xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T*xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Scatter-add kernel: y[idx[i]] += x[i] for i in [0,n). Launched with a 1D grid;
   out-of-range threads are guarded by the bounds check. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   Shared worker behind all the Mat{Mult,MultAdd,MultTranspose,...}_SeqAIJCUSPARSE wrappers above.
   Handles the compressed-row case (zero rows dropped on the GPU) via a work vector plus a
   gather/scatter step, and caches cuSPARSE SpMV descriptors per operation in matstruct->cuSpMV[]. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* empty matrix: result is just y (or zero) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose of the stored matrix */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use (and lazily build) an explicitly stored transpose */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get up-to-date zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the cuSpMV[] descriptor cache, so guard against enum changes */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                       matstruct->matDescr,
                                       matstruct->cuSpMV[opA].vecXDescr, beta,
                                       matstruct->cuSpMV[opA].vecYDescr,
                                       cusparse_scalartype,
                                       cusparsestruct->spmvAlg,
                                       &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                          matstruct->alpha_one,
                          matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                          matstruct->cuSpMV[opA].vecXDescr,
                          beta,
                          matstruct->cuSpMV[opA].vecYDescr,
                          cusparse_scalartype,
                          cusparsestruct->spmvAlg,
                          matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
#else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
#endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
#endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* zz = A^T*xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Standard SeqAIJ assembly, plus: if the nonzero pattern changed, drop the cached device-side
   matrix handle so it gets rebuilt against the new structure. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  PetscObjectState   onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
3213 3214 Level: intermediate 3215 3216 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 3217 @*/ 3218 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 3219 { 3220 PetscErrorCode ierr; 3221 3222 PetscFunctionBegin; 3223 ierr = MatCreate(comm,A);CHKERRQ(ierr); 3224 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 3225 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3226 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 3227 PetscFunctionReturn(0); 3228 } 3229 3230 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3231 { 3232 PetscErrorCode ierr; 3233 3234 PetscFunctionBegin; 3235 if (A->factortype == MAT_FACTOR_NONE) { 3236 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 3237 } else { 3238 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3239 } 3240 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3241 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3242 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3243 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3244 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3245 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 3246 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3247 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3248 ierr = 
PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr); 3249 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 3250 PetscFunctionReturn(0); 3251 } 3252 3253 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 3254 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 3255 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 3256 { 3257 PetscErrorCode ierr; 3258 3259 PetscFunctionBegin; 3260 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3261 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 3262 PetscFunctionReturn(0); 3263 } 3264 3265 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 3266 { 3267 PetscErrorCode ierr; 3268 Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3269 Mat_SeqAIJCUSPARSE *cy; 3270 Mat_SeqAIJCUSPARSE *cx; 3271 PetscScalar *ay; 3272 const PetscScalar *ax; 3273 CsrMatrix *csry,*csrx; 3274 3275 PetscFunctionBegin; 3276 cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3277 cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3278 if (X->ops->axpy != Y->ops->axpy) { 3279 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3280 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3281 PetscFunctionReturn(0); 3282 } 3283 /* if we are here, it means both matrices are bound to GPU */ 3284 ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3285 ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3286 if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3287 if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3288 csry = (CsrMatrix*)cy->mat->mat; 3289 csrx = (CsrMatrix*)cx->mat->mat; 3290 /* see if we can turn this into a cublas axpy */ 3291 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz 
&& !x->compressedrow.use && !y->compressedrow.use) { 3292 bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3293 if (eq) { 3294 eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3295 } 3296 if (eq) str = SAME_NONZERO_PATTERN; 3297 } 3298 /* spgeam is buggy with one column */ 3299 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3300 3301 if (str == SUBSET_NONZERO_PATTERN) { 3302 cusparseStatus_t stat; 3303 PetscScalar b = 1.0; 3304 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3305 size_t bufferSize; 3306 void *buffer; 3307 cudaError_t cerr; 3308 #endif 3309 3310 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3311 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3312 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 3313 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3314 stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3315 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3316 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3317 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat); 3318 cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr); 3319 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3320 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3321 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3322 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3323 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat); 3324 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3325 ierr = 
PetscLogGpuTimeEnd();CHKERRQ(ierr); 3326 cerr = cudaFree(buffer);CHKERRCUDA(cerr); 3327 #else 3328 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3329 stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3330 &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3331 &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 3332 cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat); 3333 ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr); 3334 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3335 #endif 3336 stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 3337 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3338 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3339 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3340 } else if (str == SAME_NONZERO_PATTERN) { 3341 cublasHandle_t cublasv2handle; 3342 cublasStatus_t berr; 3343 PetscBLASInt one = 1, bnz = 1; 3344 3345 ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr); 3346 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3347 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3348 ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3349 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3350 berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr); 3351 ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3352 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3353 ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr); 3354 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3355 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3356 } else { 3357 ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr); 3358 ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 3359 } 3360 PetscFunctionReturn(0); 3361 } 3362 3363 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat 
Y,PetscScalar a) 3364 { 3365 PetscErrorCode ierr; 3366 Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 3367 PetscScalar *ay; 3368 cublasHandle_t cublasv2handle; 3369 cublasStatus_t berr; 3370 PetscBLASInt one = 1, bnz = 1; 3371 3372 PetscFunctionBegin; 3373 ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr); 3374 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3375 ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr); 3376 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3377 berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr); 3378 ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); 3379 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3380 ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr); 3381 ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3382 PetscFunctionReturn(0); 3383 } 3384 3385 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3386 { 3387 PetscErrorCode ierr; 3388 PetscBool both = PETSC_FALSE; 3389 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3390 3391 PetscFunctionBegin; 3392 if (A->factortype == MAT_FACTOR_NONE) { 3393 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 3394 if (spptr->mat) { 3395 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 3396 if (matrix->values) { 3397 both = PETSC_TRUE; 3398 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3399 } 3400 } 3401 if (spptr->matTranspose) { 3402 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 3403 if (matrix->values) { 3404 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3405 } 3406 } 3407 } 3408 //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3409 ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3410 ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 3411 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3412 else A->offloadmask = PETSC_OFFLOAD_CPU; 3413 PetscFunctionReturn(0); 3414 } 3415 3416 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3417 { 3418 Mat_SeqAIJ *a = 
(Mat_SeqAIJ*)A->data; 3419 PetscErrorCode ierr; 3420 3421 PetscFunctionBegin; 3422 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3423 if (flg) { 3424 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3425 3426 A->ops->scale = MatScale_SeqAIJ; 3427 A->ops->axpy = MatAXPY_SeqAIJ; 3428 A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3429 A->ops->mult = MatMult_SeqAIJ; 3430 A->ops->multadd = MatMultAdd_SeqAIJ; 3431 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3432 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3433 A->ops->multhermitiantranspose = NULL; 3434 A->ops->multhermitiantransposeadd = NULL; 3435 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 3436 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr); 3437 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3438 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3439 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3440 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3441 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3442 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3443 } else { 3444 A->ops->scale = MatScale_SeqAIJCUSPARSE; 3445 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3446 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3447 A->ops->mult = MatMult_SeqAIJCUSPARSE; 3448 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3449 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3450 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3451 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3452 
A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use  = flg; /* inode-optimized kernels are only used on the CPU path */
  PetscFunctionReturn(0);
}

/* Converts a MATSEQAIJ matrix to MATSEQAIJCUSPARSE: allocates the per-matrix cusparse
   context (handle on the default PETSc stream, storage format, SpMV/SpMM/csr2csc
   algorithm defaults) and installs the GPU method table via MatBindToCPU(...,PETSC_FALSE). */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors created by this matrix default to CUDA vectors */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
 #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
 #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry the triangular-factor container instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); /* install the GPU ops */
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
/* Type constructor for MATSEQAIJCUSPARSE: builds a plain SeqAIJ matrix and then
   converts it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

/* Registers the CUSPARSE-based factorizations (band LU for SeqAIJ; LU/Cholesky/ILU/ICC
   for SeqAIJCUSPARSE) with PETSc's solver-type table. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

/* Frees the Mat_SeqAIJCUSPARSE context: both mult structs (matrix and cached transpose),
   the device scratch arrays used by the COO assembly path, and the cusparse handle. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr =
PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Frees a CsrMatrix (device values, column indices, row offsets) and nulls the pointer. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}

/* Frees a triangular-factor struct: cusparse descriptors, the CSR storage, the
   solve (and, on CUDA >= 11, csr2csc) scratch buffers and the host copy of the values. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
#endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Frees a mult struct in the given storage format: the CSR (or, pre-CUDA-11, HYB)
   matrix, its descriptor, the device-resident scalar constants, and on CUDA >= 11
   the cached SpMV descriptors and buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
#endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* the error code was previously dropped here; check it like every other call site */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}

/* Releases all factor data (both triangular factors, their transposes, the row/column
   permutations, the work vector and the band storage) but keeps the container struct
   and its cusparse handle alive so it can be refilled (continues below). */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
(*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
    if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}

/* Fully destroys the triangular-factor container: resets its contents, destroys the
   cusparse handle, and frees the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if (handle = (*trifactors)->handle) { /* intentional assignment-in-condition */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Lexicographic (row, col) less-than comparator for COO entries held as thrust tuples */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Equality of two (row, col) COO entries */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* 0 when the two values are equal, 1 otherwise; used with adjacent_difference to
   flag positions where the row or column index changes */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1 == t2 ? 0 : 1;
  }
};

/* Logical OR of two flags; combines the row-change and column-change indicators */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/* Sets the values of a matrix preallocated with MatSetPreallocationCOO_SeqAIJCUSPARSE().
   v[] is in the original (user) COO order; cooPerm maps it to the sorted CSR order,
   and cooPerm_a (when present) folds repeated (i,j) entries into one CSR slot. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO preallocation: fall back to a plain final assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) { /* NULL values with INSERT means zero the matrix */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else { /* host values: stage them on the device first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
         cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy is now the authoritative one */
  ierr =
PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}

/* Marks the cached transpose as out of date; when destroy is true the cached transpose
   mult struct and the csr2csc index map are freed as well. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}

#include <thrust/binary_search.h>
/* Preallocates A from n COO entries (coo_i[], coo_j[]): sorts the entries on the GPU,
   removes duplicates, builds the host CSR structure, and records the permutation
   (cooPerm) and the duplicate map (cooPerm_a) used later by MatSetValuesCOO. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* entry count changed: discard previously cached permutations */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
       n = 6
       coo_i = [3,3,1,4,1,4]
       coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i = [1,1,3,3,4,4]
      d_j = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i = [1,3,3,4,4,x]
                       ^ekey
      d_j = [2,2,3,5,6,x]
                       ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0] */
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1] */
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1] */
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /* cooPerm_a = [0,0,1,2,3,4] */
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,          /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                                      /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr); /* count nonempty rows */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  ia - the CSR row pointers
-  ja - the CSR column indices

   Level: developer

   Notes:
     When compressed is true, the CSR structure does not contain empty rows

.seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) { /* lazily build and cache the uncompressed row offsets on the device */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

   Input Parameters:
+  A - the matrix
-  compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

   Output Parameters:
+  ia - the CSR row pointers
-  ja - the CSR column indices

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (i) *i = NULL; /* only nulls the caller's pointers; nothing device-side to release */
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* make sure the device copy is current */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* read-only access: no state change needed */
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* bring current data to the device before handing it out */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller may write through the pointer */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values may have changed */
  *a = NULL;
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* write-only: device becomes the valid copy */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.
a - pointer to the device data 4215 4216 Level: developer 4217 4218 .seealso: MatSeqAIJCUSPARSEGetArrayWrite() 4219 @*/ 4220 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 4221 { 4222 PetscErrorCode ierr; 4223 4224 PetscFunctionBegin; 4225 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4226 PetscValidPointer(a,2); 4227 PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4228 ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 4229 *a = NULL; 4230 PetscFunctionReturn(0); 4231 } 4232 4233 struct IJCompare4 4234 { 4235 __host__ __device__ 4236 inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4237 { 4238 if (t1.get<0>() < t2.get<0>()) return true; 4239 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4240 return false; 4241 } 4242 }; 4243 4244 struct Shift 4245 { 4246 int _shift; 4247 4248 Shift(int shift) : _shift(shift) {} 4249 __host__ __device__ 4250 inline int operator() (const int &c) 4251 { 4252 return c + _shift; 4253 } 4254 }; 4255 4256 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. 
   [A';B']' operation in matlab notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  /* row counts must agree since C has the rows of A with the columns of A and B side by side */
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* --- build C from scratch: symbolic + numeric phase, entirely on the GPU --- */
    m    = A->rmap->n;
    n    = A->cmap->n + B->cmap->n;
    ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    /* C stores a full (non-compressed) row structure */
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    /* device-resident scalar constants used by the SpMV/SpMM calls */
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    /* cooPerm records, for each entry of A followed by each entry of B, its slot in C;
       it is reused by the MAT_REUSE_MATRIX path below to scatter new values */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand the CSR row offsets of A and B into explicit COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* view B's columns shifted right by A's column count, without modifying B */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      /* merge the two COO streams (row, col, value, origin-flag) into C's ordering;
         origin-flag is 1 for A entries and 0 for B entries */
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place shift applied to B's column indices above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
      /* split C slot indices by origin-flag: A entries first (p1), B entries after (p2) */
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices back into C's CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        /* C' = [A' ; B'] (vertical stacking), so the transposed CSR of C is A's
           transpose followed by B's transpose with offsets shifted by a->nz */
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          /* overwrite A's trailing offset with B's first (shifted) offset */
          thrust::advance(rT,-1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror C's structure on the host so the Mat behaves like a regular SeqAIJ */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    /* derive per-row lengths and row statistics from the host row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    /* host values array is allocated but not filled: valid data stays on the GPU */
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* --- MAT_REUSE_MATRIX: structure is fixed, scatter new values via cooPerm --- */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      /* consistency checks: the stored structure must match what was built initially */
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      /* cooPerm holds A's destination slots first, then B's (split at pmid) */
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* transpose values are simply A's followed by B's (see MAT_INITIAL_MATRIX path) */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Gathers n entries of the (device-resident) value array of A at positions idx[]
   into v[], which may be either host or device memory (detected via isCudaMem).
   With idx == NULL the first n values are copied contiguously. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the index set, then gather on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* destination is host memory: gather into a device scratch array first */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* NOTE(review): the copies above move data device->host when !dmem, yet this logs
     CpuToGpu traffic; PetscLogGpuToCpu looks intended here - confirm before changing */
  if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}