1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 #define PETSC_SKIP_CXX_COMPLEX_FIX 7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 8 9 #include <petscconf.h> 10 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11 #include <../src/mat/impls/sbaij/seq/sbaij.h> 12 #include <../src/vec/vec/impls/dvecimpl.h> 13 #include <petsc/private/vecimpl.h> 14 #undef VecType 15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16 17 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 19 /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 20 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 21 22 typedef enum { 23 CUSPARSE_MV_ALG_DEFAULT = 0, 24 CUSPARSE_COOMV_ALG = 1, 25 CUSPARSE_CSRMV_ALG1 = 2, 26 CUSPARSE_CSRMV_ALG2 = 3 27 } cusparseSpMVAlg_t; 28 29 typedef enum { 30 CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 31 CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 32 CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 33 CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 34 CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 35 CUSPARSE_SPMM_ALG_DEFAULT = 0, 36 CUSPARSE_SPMM_COO_ALG1 = 1, 37 CUSPARSE_SPMM_COO_ALG2 = 2, 38 CUSPARSE_SPMM_COO_ALG3 = 3, 39 CUSPARSE_SPMM_COO_ALG4 = 5, 40 CUSPARSE_SPMM_CSR_ALG1 = 4, 41 CUSPARSE_SPMM_CSR_ALG2 = 6, 42 } cusparseSpMMAlg_t; 43 44 typedef enum { 45 CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 46 CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 47 } cusparseCsr2CscAlg_t; 48 */ 49 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 50 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 51 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 52 #endif 53 54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 57 58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 61 62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 67 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 68 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 69 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 70 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 71 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 72 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 73 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 74 75 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 76 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 77 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 78 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**); 79 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 80 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 81 82 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 83 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 84 85 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 86 { 87 cusparseStatus_t stat; 88 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 89 90 PetscFunctionBegin; 91 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 92 cusparsestruct->stream = stream; 93 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 94 PetscFunctionReturn(0); 95 } 96 97 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 98 { 99 cusparseStatus_t stat; 100 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 101 102 PetscFunctionBegin; 103 if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 104 if (cusparsestruct->handle != handle) { 105 if (cusparsestruct->handle) { 106 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 107 } 108 cusparsestruct->handle = handle; 109 } 110 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 111 PetscFunctionReturn(0); 112 } 113 114 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 115 { 116 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 117 PetscBool flg; 118 PetscErrorCode ierr; 119 120 PetscFunctionBegin; 121 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 122 if (!flg || !cusparsestruct) PetscFunctionReturn(0); 123 if (cusparsestruct->handle) cusparsestruct->handle = 0; 124 PetscFunctionReturn(0); 125 } 126 127 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 128 { 129 PetscFunctionBegin; 130 *type = MATSOLVERCUSPARSE; 131 PetscFunctionReturn(0); 132 } 133 134 /*MC 135 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 136 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 137 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 138 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 139 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 140 algorithms are not recommended. This class does NOT support direct solver operations. 141 142 Level: beginner 143 144 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 145 M*/ 146 147 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 148 { 149 PetscErrorCode ierr; 150 PetscInt n = A->rmap->n; 151 152 PetscFunctionBegin; 153 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 154 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 155 (*B)->factortype = ftype; 156 (*B)->useordering = PETSC_TRUE; 157 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 158 159 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 160 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 161 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 162 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 163 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 164 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 165 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 166 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 167 168 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 169 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 170 PetscFunctionReturn(0); 171 } 172 173 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 174 { 175 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 176 177 PetscFunctionBegin; 178 switch (op) { 179 case MAT_CUSPARSE_MULT: 180 cusparsestruct->format = format; 181 break; 182 case MAT_CUSPARSE_ALL: 183 cusparsestruct->format = format; 184 break; 185 default: 186 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 187 } 188 PetscFunctionReturn(0); 189 } 190 191 /*@ 192 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 193 operation. Only the MatMult operation can use different GPU storage formats 194 for MPIAIJCUSPARSE matrices. 195 Not Collective 196 197 Input Parameters: 198 + A - Matrix of type SEQAIJCUSPARSE 199 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 200 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 201 202 Output Parameter: 203 204 Level: intermediate 205 206 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 207 @*/ 208 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 209 { 210 PetscErrorCode ierr; 211 212 PetscFunctionBegin; 213 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 214 ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 215 PetscFunctionReturn(0); 216 } 217 218 /*@ 219 MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose 220 221 Collective on mat 222 223 Input Parameters: 224 + A - Matrix of type SEQAIJCUSPARSE 225 - transgen - the boolean flag 226 227 Level: intermediate 228 229 .seealso: MATSEQAIJCUSPARSE 230 @*/ 231 PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen) 232 { 233 PetscErrorCode ierr; 234 PetscBool flg; 235 236 PetscFunctionBegin; 237 PetscValidHeaderSpecific(A,MAT_CLASSID,1); 238 ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 239 if (flg) { 240 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 241 242 if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 243 cusp->transgen = transgen; 244 if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */ 245 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 246 } 247 } 248 PetscFunctionReturn(0); 249 } 250 251 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 252 { 253 PetscErrorCode ierr; 254 MatCUSPARSEStorageFormat format; 255 PetscBool flg; 256 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 257 258 PetscFunctionBegin; 259 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 260 if (A->factortype == MAT_FACTOR_NONE) { 261 PetscBool transgen = cusparsestruct->transgen; 262 263 ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr); 264 if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);} 265 266 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 267 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 268 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 269 270 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 271 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 272 if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 273 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 274 cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 275 ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 276 "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 277 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 278 if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 279 280 cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 281 ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 282 "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 283 if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 284 285 cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 286 ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 287 "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 288 if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 289 #endif 290 } 291 ierr = PetscOptionsTail();CHKERRQ(ierr); 292 PetscFunctionReturn(0); 293 } 294 295 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 296 { 297 PetscErrorCode ierr; 298 299 PetscFunctionBegin; 300 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 301 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 302 PetscFunctionReturn(0); 303 } 304 305 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 306 { 307 PetscErrorCode ierr; 308 309 PetscFunctionBegin; 310 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 311 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 312 PetscFunctionReturn(0); 313 } 314 315 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 316 { 317 PetscErrorCode ierr; 318 319 PetscFunctionBegin; 320 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 321 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 322 PetscFunctionReturn(0); 323 } 324 325 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 326 { 327 PetscErrorCode ierr; 328 329 PetscFunctionBegin; 330 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 331 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 332 PetscFunctionReturn(0); 333 } 334 335 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 336 { 337 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 338 PetscInt n = A->rmap->n; 339 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 340 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 341 cusparseStatus_t stat; 342 const PetscInt *ai = a->i,*aj = a->j,*vi; 343 const MatScalar *aa = a->a,*v; 344 PetscInt *AiLo, *AjLo; 345 PetscScalar *AALo; 346 PetscInt i,nz, nzLower, offset, rowOffset; 347 PetscErrorCode ierr; 348 cudaError_t cerr; 349 350 PetscFunctionBegin; 351 if (!n) PetscFunctionReturn(0); 352 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 353 try { 354 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 355 nzLower=n+ai[n]-ai[1]; 356 357 /* Allocate Space for the lower triangular matrix */ 358 cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 359 cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 360 cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 361 362 /* Fill the lower triangular matrix */ 363 AiLo[0] = (PetscInt) 0; 364 AiLo[n] = nzLower; 365 AjLo[0] = (PetscInt) 0; 366 AALo[0] = (MatScalar) 1.0; 367 v = aa; 368 vi = aj; 369 offset = 1; 370 rowOffset= 1; 371 for (i=1; i<n; i++) { 372 nz = ai[i+1] - ai[i]; 373 /* additional 1 for the term on the diagonal */ 374 AiLo[i] = rowOffset; 375 rowOffset += nz+1; 376 377 ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 378 ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 379 380 offset += nz; 381 AjLo[offset] = (PetscInt) i; 382 AALo[offset] = (MatScalar) 1.0; 383 offset += 1; 384 385 v += nz; 386 vi += nz; 387 } 388 389 /* allocate space for the triangular factor information */ 390 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 391 392 /* Create the matrix description */ 393 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 394 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 395 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 396 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 397 #else 398 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 399 #endif 400 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 401 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 402 403 /* set the operation */ 404 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 405 406 /* set the matrix */ 407 loTriFactor->csrMat = new CsrMatrix; 408 loTriFactor->csrMat->num_rows = n; 409 loTriFactor->csrMat->num_cols = n; 410 loTriFactor->csrMat->num_entries = nzLower; 411 412 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 413 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 414 415 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 416 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 417 418 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 419 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 420 421 /* Create the solve analysis information */ 422 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 423 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 424 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 425 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 426 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 427 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 428 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 429 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 430 #endif 431 432 /* perform the solve analysis */ 433 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 434 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 435 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 436 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 437 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 438 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 439 #endif 440 );CHKERRCUSPARSE(stat); 441 442 /* assign the pointer. Is this really necessary? */ 443 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 444 445 cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 446 cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 447 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 448 ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 449 } catch(char *ex) { 450 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 451 } 452 } 453 PetscFunctionReturn(0); 454 } 455 456 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 457 { 458 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 459 PetscInt n = A->rmap->n; 460 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 461 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 462 cusparseStatus_t stat; 463 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 464 const MatScalar *aa = a->a,*v; 465 PetscInt *AiUp, *AjUp; 466 PetscScalar *AAUp; 467 PetscInt i,nz, nzUpper, offset; 468 PetscErrorCode ierr; 469 cudaError_t cerr; 470 471 PetscFunctionBegin; 472 if (!n) PetscFunctionReturn(0); 473 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 474 try { 475 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 476 nzUpper = adiag[0]-adiag[n]; 477 478 /* Allocate Space for the upper triangular matrix */ 479 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 480 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 481 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 482 483 /* Fill the upper triangular matrix */ 484 AiUp[0]=(PetscInt) 0; 485 AiUp[n]=nzUpper; 486 offset = nzUpper; 487 for (i=n-1; i>=0; i--) { 488 v = aa + adiag[i+1] + 1; 489 vi = aj + adiag[i+1] + 1; 490 491 /* number of elements NOT on the diagonal */ 492 nz = adiag[i] - adiag[i+1]-1; 493 494 /* decrement the offset */ 495 offset -= (nz+1); 496 497 /* first, set the diagonal elements */ 498 AjUp[offset] = (PetscInt) i; 499 AAUp[offset] = (MatScalar)1./v[nz]; 500 AiUp[i] = AiUp[i+1] - (nz+1); 501 502 ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 503 ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 504 } 505 506 /* allocate space for the triangular factor information */ 507 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 508 509 /* Create the matrix description */ 510 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 511 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 512 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 513 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 514 #else 515 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 516 #endif 517 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 518 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 519 520 /* set the operation */ 521 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 522 523 /* set the matrix */ 524 upTriFactor->csrMat = new CsrMatrix; 525 upTriFactor->csrMat->num_rows = n; 526 upTriFactor->csrMat->num_cols = n; 527 upTriFactor->csrMat->num_entries = nzUpper; 528 529 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 530 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 531 532 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 533 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 534 535 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 536 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 537 538 /* Create the solve analysis information */ 539 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 540 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 541 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 542 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 543 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 544 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 545 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 546 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 547 #endif 548 549 /* perform the solve analysis */ 550 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 551 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 552 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 553 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 554 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 555 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 556 #endif 557 );CHKERRCUSPARSE(stat); 558 559 /* assign the pointer. Is this really necessary? */ 560 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 561 562 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 563 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 564 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 565 ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 566 } catch(char *ex) { 567 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 568 } 569 } 570 PetscFunctionReturn(0); 571 } 572 573 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 574 { 575 PetscErrorCode ierr; 576 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 577 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 578 IS isrow = a->row,iscol = a->icol; 579 PetscBool row_identity,col_identity; 580 const PetscInt *r,*c; 581 PetscInt n = A->rmap->n; 582 583 PetscFunctionBegin; 584 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 585 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 586 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 587 588 cusparseTriFactors->workVector = new THRUSTARRAY(n); 589 cusparseTriFactors->nnz=a->nz; 590 591 A->offloadmask = PETSC_OFFLOAD_BOTH; 592 /* lower triangular indices */ 593 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 594 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 595 if (!row_identity) { 596 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 597 cusparseTriFactors->rpermIndices->assign(r, r+n); 598 } 599 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 600 601 /* upper triangular indices */ 602 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 603 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 604 if (!col_identity) { 605 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 606 cusparseTriFactors->cpermIndices->assign(c, c+n); 607 } 608 609 if (!row_identity && !col_identity) { 610 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 611 } else if(!row_identity) { 612 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 613 } else if(!col_identity) { 614 ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 615 } 616 617 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 618 PetscFunctionReturn(0); 619 } 620 621 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 622 { 623 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 624 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 625 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 626 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 627 cusparseStatus_t stat; 628 PetscErrorCode ierr; 629 cudaError_t cerr; 630 PetscInt *AiUp, *AjUp; 631 PetscScalar *AAUp; 632 PetscScalar *AALo; 633 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 634 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 635 const PetscInt *ai = b->i,*aj = b->j,*vj; 636 const MatScalar *aa = b->a,*v; 637 638 PetscFunctionBegin; 639 if (!n) PetscFunctionReturn(0); 640 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 641 try { 642 /* Allocate Space for the upper triangular matrix */ 643 cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 644 cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 645 cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 646 cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 647 648 /* Fill the upper triangular matrix */ 649 AiUp[0]=(PetscInt) 0; 650 AiUp[n]=nzUpper; 651 offset = 0; 652 for (i=0; i<n; i++) { 653 /* set the pointers */ 654 v = aa + ai[i]; 655 vj = aj + ai[i]; 656 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 657 658 /* first, set the diagonal elements */ 659 AjUp[offset] = (PetscInt) i; 660 AAUp[offset] = (MatScalar)1.0/v[nz]; 661 AiUp[i] = offset; 662 AALo[offset] = (MatScalar)1.0/v[nz]; 663 664 offset+=1; 665 if (nz>0) { 666 ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 667 ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 668 for (j=offset; j<offset+nz; j++) { 669 AAUp[j] = -AAUp[j]; 670 AALo[j] = AAUp[j]/v[nz]; 671 } 672 offset+=nz; 673 } 674 } 675 676 /* allocate space for the triangular factor information */ 677 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 678 679 /* Create the matrix description */ 680 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 681 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 682 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 683 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 684 #else 685 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 686 #endif 687 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 688 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 689 690 /* set the matrix */ 691 upTriFactor->csrMat = new CsrMatrix; 692 upTriFactor->csrMat->num_rows = A->rmap->n; 693 upTriFactor->csrMat->num_cols = A->cmap->n; 694 upTriFactor->csrMat->num_entries = a->nz; 695 696 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 697 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 698 699 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 700 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 701 702 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 703 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 704 705 /* set the operation */ 706 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 707 708 /* Create the solve analysis information */ 709 stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 710 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 711 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 712 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 713 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 714 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 715 &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 716 cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 717 #endif 718 719 /* perform the solve analysis */ 720 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 721 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 722 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 723 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 724 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 725 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 726 #endif 727 );CHKERRCUSPARSE(stat); 728 729 /* assign the pointer. Is this really necessary? */ 730 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 731 732 /* allocate space for the triangular factor information */ 733 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 734 735 /* Create the matrix description */ 736 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 737 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 738 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 739 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 740 #else 741 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 742 #endif 743 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 744 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 745 746 /* set the operation */ 747 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 748 749 /* set the matrix */ 750 loTriFactor->csrMat = new CsrMatrix; 751 loTriFactor->csrMat->num_rows = A->rmap->n; 752 loTriFactor->csrMat->num_cols = A->cmap->n; 753 loTriFactor->csrMat->num_entries = a->nz; 754 755 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 756 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 757 758 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 759 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 760 761 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 762 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 763 ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 764 765 /* Create the solve analysis information */ 766 stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 767 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 768 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 769 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 770 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 771 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 772 &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 773 cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 774 #endif 775 776 /* perform the solve analysis */ 777 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 778 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 779 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 780 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 781 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 782 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 783 #endif 784 );CHKERRCUSPARSE(stat); 785 786 /* assign the pointer. Is this really necessary? */ 787 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 788 789 A->offloadmask = PETSC_OFFLOAD_BOTH; 790 cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 791 cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 792 cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 793 cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 794 } catch(char *ex) { 795 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 796 } 797 } 798 PetscFunctionReturn(0); 799 } 800 801 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 802 { 803 PetscErrorCode ierr; 804 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 805 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 806 IS ip = a->row; 807 const PetscInt *rip; 808 PetscBool perm_identity; 809 PetscInt n = A->rmap->n; 810 811 PetscFunctionBegin; 812 ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 813 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 814 cusparseTriFactors->workVector = new THRUSTARRAY(n); 815 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 816 817 /* lower triangular indices */ 818 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 819 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 820 if (!perm_identity) { 821 IS iip; 822 const PetscInt *irip; 823 824 ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 825 ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 826 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 827 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 828 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 829 cusparseTriFactors->cpermIndices->assign(irip, irip+n); 830 ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 831 ierr = ISDestroy(&iip);CHKERRQ(ierr); 832 ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 833 } 834 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 835 PetscFunctionReturn(0); 836 } 837 838 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 839 { 840 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 841 IS isrow = b->row,iscol = b->col; 842 PetscBool row_identity,col_identity; 843 PetscErrorCode ierr; 844 845 PetscFunctionBegin; 846 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 847 B->offloadmask = PETSC_OFFLOAD_CPU; 848 /* determine which version of MatSolve needs to be used. */ 849 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 850 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 851 if (row_identity && col_identity) { 852 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 853 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 854 B->ops->matsolve = NULL; 855 B->ops->matsolvetranspose = NULL; 856 } else { 857 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 858 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 859 B->ops->matsolve = NULL; 860 B->ops->matsolvetranspose = NULL; 861 } 862 863 /* get the triangular factors */ 864 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 865 PetscFunctionReturn(0); 866 } 867 868 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 869 { 870 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 871 IS ip = b->row; 872 PetscBool perm_identity; 873 PetscErrorCode ierr; 874 875 PetscFunctionBegin; 876 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 877 B->offloadmask = PETSC_OFFLOAD_CPU; 878 /* determine which version of MatSolve needs to be used. */ 879 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 880 if (perm_identity) { 881 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 882 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 883 B->ops->matsolve = NULL; 884 B->ops->matsolvetranspose = NULL; 885 } else { 886 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 887 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 888 B->ops->matsolve = NULL; 889 B->ops->matsolvetranspose = NULL; 890 } 891 892 /* get the triangular factors */ 893 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 894 PetscFunctionReturn(0); 895 } 896 897 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 898 { 899 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 900 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 901 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 902 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 903 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 904 cusparseStatus_t stat; 905 cusparseIndexBase_t indexBase; 906 cusparseMatrixType_t matrixType; 907 cusparseFillMode_t fillMode; 908 cusparseDiagType_t diagType; 909 910 PetscFunctionBegin; 911 912 /*********************************************/ 913 /* Now the Transpose of the Lower Tri Factor */ 914 /*********************************************/ 915 916 /* allocate space for the transpose of the lower triangular factor */ 917 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 918 919 /* set the matrix descriptors of the lower triangular factor */ 920 matrixType = cusparseGetMatType(loTriFactor->descr); 921 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 922 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 923 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 924 diagType = cusparseGetMatDiagType(loTriFactor->descr); 925 926 /* Create the matrix description */ 927 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 928 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 929 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 930 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 931 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 932 933 /* set the operation */ 934 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 935 936 /* allocate GPU space for the CSC of the lower triangular factor*/ 937 loTriFactorT->csrMat = new CsrMatrix; 938 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 939 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 940 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 941 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 942 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 943 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 944 945 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 946 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 947 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 948 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 949 loTriFactor->csrMat->values->data().get(), 950 loTriFactor->csrMat->row_offsets->data().get(), 951 loTriFactor->csrMat->column_indices->data().get(), 952 loTriFactorT->csrMat->values->data().get(), 953 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 954 CUSPARSE_ACTION_NUMERIC,indexBase, 955 CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 956 cudaError_t cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 957 #endif 958 959 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 960 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 961 loTriFactor->csrMat->values->data().get(), 962 loTriFactor->csrMat->row_offsets->data().get(), 963 loTriFactor->csrMat->column_indices->data().get(), 964 loTriFactorT->csrMat->values->data().get(), 965 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 966 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 967 CUSPARSE_ACTION_NUMERIC, indexBase, 968 CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 969 #else 970 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 971 CUSPARSE_ACTION_NUMERIC, indexBase 972 #endif 973 );CHKERRCUSPARSE(stat); 974 975 /* Create the solve analysis information */ 976 stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 977 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 978 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 979 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 980 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 981 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 982 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 983 cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 984 #endif 985 986 /* perform the solve analysis */ 987 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 988 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 989 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 990 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 991 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 992 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 993 #endif 994 );CHKERRCUSPARSE(stat); 995 996 /* assign the pointer. Is this really necessary? */ 997 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 998 999 /*********************************************/ 1000 /* Now the Transpose of the Upper Tri Factor */ 1001 /*********************************************/ 1002 1003 /* allocate space for the transpose of the upper triangular factor */ 1004 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 1005 1006 /* set the matrix descriptors of the upper triangular factor */ 1007 matrixType = cusparseGetMatType(upTriFactor->descr); 1008 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1009 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1010 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1011 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1012 1013 /* Create the matrix description */ 1014 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 1015 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 1016 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 1017 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 1018 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1019 1020 /* set the operation */ 1021 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1022 1023 /* allocate GPU space for the CSC of the upper triangular factor*/ 1024 upTriFactorT->csrMat = new CsrMatrix; 1025 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1026 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1027 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1028 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1029 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1030 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1031 1032 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1033 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1034 stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1035 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1036 upTriFactor->csrMat->values->data().get(), 1037 upTriFactor->csrMat->row_offsets->data().get(), 1038 upTriFactor->csrMat->column_indices->data().get(), 1039 upTriFactorT->csrMat->values->data().get(), 1040 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1041 CUSPARSE_ACTION_NUMERIC,indexBase, 1042 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1043 cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1044 #endif 1045 1046 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1047 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1048 upTriFactor->csrMat->values->data().get(), 1049 upTriFactor->csrMat->row_offsets->data().get(), 1050 upTriFactor->csrMat->column_indices->data().get(), 1051 upTriFactorT->csrMat->values->data().get(), 1052 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1053 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1054 CUSPARSE_ACTION_NUMERIC, indexBase, 1055 CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1056 #else 1057 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1058 CUSPARSE_ACTION_NUMERIC, indexBase 1059 #endif 1060 );CHKERRCUSPARSE(stat); 1061 1062 /* Create the solve analysis information */ 1063 stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1064 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1065 stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1066 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1067 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1068 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1069 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1070 cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1071 #endif 1072 1073 /* perform the solve analysis */ 1074 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1075 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1076 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1077 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 1078 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1079 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1080 #endif 1081 );CHKERRCUSPARSE(stat); 1082 1083 /* assign the pointer. Is this really necessary? */ 1084 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1085 PetscFunctionReturn(0); 1086 } 1087 1088 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 1089 { 1090 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1091 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1092 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1093 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1094 cusparseStatus_t stat; 1095 cusparseIndexBase_t indexBase; 1096 cudaError_t err; 1097 PetscErrorCode ierr; 1098 1099 PetscFunctionBegin; 1100 if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0); 1101 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1102 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1103 /* create cusparse matrix */ 1104 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 1105 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1106 indexBase = cusparseGetMatIndexBase(matstruct->descr); 1107 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 1108 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1109 1110 /* set alpha and beta */ 1111 err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1112 err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1113 err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1114 err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1115 err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1116 err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1117 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1118 1119 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1120 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1121 CsrMatrix *matrixT= new CsrMatrix; 1122 matrixT->num_rows = A->cmap->n; 1123 matrixT->num_cols = A->rmap->n; 1124 matrixT->num_entries = a->nz; 1125 matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1126 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1127 matrixT->values = new THRUSTARRAY(a->nz); 1128 1129 cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 1130 cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1131 1132 /* compute the transpose, i.e. the CSC */ 1133 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1134 stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1135 A->cmap->n, matrix->num_entries, 1136 matrix->values->data().get(), 1137 cusparsestruct->rowoffsets_gpu->data().get(), 1138 matrix->column_indices->data().get(), 1139 matrixT->values->data().get(), 1140 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1141 CUSPARSE_ACTION_NUMERIC,indexBase, 1142 cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1143 err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err); 1144 #endif 1145 1146 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1147 A->cmap->n, matrix->num_entries, 1148 matrix->values->data().get(), 1149 cusparsestruct->rowoffsets_gpu->data().get(), 1150 matrix->column_indices->data().get(), 1151 matrixT->values->data().get(), 1152 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1153 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1154 CUSPARSE_ACTION_NUMERIC,indexBase, 1155 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1156 #else 1157 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1158 CUSPARSE_ACTION_NUMERIC, indexBase 1159 #endif 1160 );CHKERRCUSPARSE(stat); 1161 matstructT->mat = matrixT; 1162 1163 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1164 stat = cusparseCreateCsr(&matstructT->matDescr, 1165 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1166 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1167 matrixT->values->data().get(), 1168 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1169 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1170 #endif 1171 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1172 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1173 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1174 #else 1175 CsrMatrix *temp = new CsrMatrix; 1176 CsrMatrix *tempT = new CsrMatrix; 1177 /* First convert HYB to CSR */ 1178 temp->num_rows = A->rmap->n; 1179 temp->num_cols = A->cmap->n; 1180 temp->num_entries = a->nz; 1181 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1182 temp->column_indices = new THRUSTINTARRAY32(a->nz); 1183 temp->values = new THRUSTARRAY(a->nz); 1184 1185 stat = cusparse_hyb2csr(cusparsestruct->handle, 1186 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1187 temp->values->data().get(), 1188 temp->row_offsets->data().get(), 1189 temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1190 1191 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1192 tempT->num_rows = A->rmap->n; 1193 tempT->num_cols = A->cmap->n; 1194 tempT->num_entries = a->nz; 1195 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1196 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1197 tempT->values = new THRUSTARRAY(a->nz); 1198 1199 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1200 temp->num_cols, temp->num_entries, 1201 temp->values->data().get(), 1202 temp->row_offsets->data().get(), 1203 temp->column_indices->data().get(), 1204 tempT->values->data().get(), 1205 tempT->column_indices->data().get(), 1206 tempT->row_offsets->data().get(), 1207 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1208 1209 /* Last, convert CSC to HYB */ 1210 cusparseHybMat_t hybMat; 1211 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1212 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1213 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1214 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1215 matstructT->descr, tempT->values->data().get(), 1216 tempT->row_offsets->data().get(), 1217 tempT->column_indices->data().get(), 1218 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1219 1220 /* assign the pointer */ 1221 matstructT->mat = hybMat; 1222 /* delete temporaries */ 1223 if (tempT) { 1224 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1225 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1226 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1227 delete (CsrMatrix*) tempT; 1228 } 1229 if (temp) { 1230 if (temp->values) delete (THRUSTARRAY*) temp->values; 1231 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1232 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1233 delete (CsrMatrix*) temp; 1234 } 1235 #endif 1236 } 1237 err = WaitForCUDA();CHKERRCUDA(err); 1238 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1239 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1240 /* the compressed row indices is not used for matTranspose */ 1241 matstructT->cprowIndices = NULL; 1242 /* assign the pointer */ 1243 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1244 PetscFunctionReturn(0); 1245 } 1246 1247 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1248 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1249 { 1250 PetscInt n = xx->map->n; 1251 const PetscScalar *barray; 1252 PetscScalar *xarray; 1253 thrust::device_ptr<const PetscScalar> bGPU; 1254 thrust::device_ptr<PetscScalar> xGPU; 1255 cusparseStatus_t stat; 1256 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1257 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1258 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1259 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1260 PetscErrorCode ierr; 1261 cudaError_t cerr; 1262 1263 PetscFunctionBegin; 1264 /* Analyze the matrix and create the transpose ... on the fly */ 1265 if (!loTriFactorT && !upTriFactorT) { 1266 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1267 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1268 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1269 } 1270 1271 /* Get the GPU pointers */ 1272 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1273 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1274 xGPU = thrust::device_pointer_cast(xarray); 1275 bGPU = thrust::device_pointer_cast(barray); 1276 1277 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1278 /* First, reorder with the row permutation */ 1279 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1280 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1281 xGPU); 1282 1283 /* First, solve U */ 1284 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1285 upTriFactorT->csrMat->num_rows, 1286 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1287 upTriFactorT->csrMat->num_entries, 1288 #endif 1289 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1290 upTriFactorT->csrMat->values->data().get(), 1291 upTriFactorT->csrMat->row_offsets->data().get(), 1292 upTriFactorT->csrMat->column_indices->data().get(), 1293 upTriFactorT->solveInfo, 1294 xarray, tempGPU->data().get() 1295 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1296 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1297 #endif 1298 );CHKERRCUSPARSE(stat); 1299 1300 /* Then, solve L */ 1301 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1302 loTriFactorT->csrMat->num_rows, 1303 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1304 loTriFactorT->csrMat->num_entries, 1305 #endif 1306 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1307 loTriFactorT->csrMat->values->data().get(), 1308 loTriFactorT->csrMat->row_offsets->data().get(), 1309 loTriFactorT->csrMat->column_indices->data().get(), 1310 loTriFactorT->solveInfo, 1311 tempGPU->data().get(), xarray 1312 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1313 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1314 #endif 1315 );CHKERRCUSPARSE(stat); 1316 1317 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1318 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1319 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1320 tempGPU->begin()); 1321 1322 /* Copy the temporary to the full solution. */ 1323 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1324 1325 /* restore */ 1326 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1327 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1328 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1329 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1330 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1331 PetscFunctionReturn(0); 1332 } 1333 1334 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1335 { 1336 const PetscScalar *barray; 1337 PetscScalar *xarray; 1338 cusparseStatus_t stat; 1339 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1340 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1341 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1342 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1343 PetscErrorCode ierr; 1344 cudaError_t cerr; 1345 1346 PetscFunctionBegin; 1347 /* Analyze the matrix and create the transpose ... on the fly */ 1348 if (!loTriFactorT && !upTriFactorT) { 1349 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1350 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1351 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1352 } 1353 1354 /* Get the GPU pointers */ 1355 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1356 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1357 1358 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1359 /* First, solve U */ 1360 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1361 upTriFactorT->csrMat->num_rows, 1362 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1363 upTriFactorT->csrMat->num_entries, 1364 #endif 1365 &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1366 upTriFactorT->csrMat->values->data().get(), 1367 upTriFactorT->csrMat->row_offsets->data().get(), 1368 upTriFactorT->csrMat->column_indices->data().get(), 1369 upTriFactorT->solveInfo, 1370 barray, tempGPU->data().get() 1371 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1372 ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1373 #endif 1374 );CHKERRCUSPARSE(stat); 1375 1376 /* Then, solve L */ 1377 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1378 loTriFactorT->csrMat->num_rows, 1379 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1380 loTriFactorT->csrMat->num_entries, 1381 #endif 1382 &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1383 loTriFactorT->csrMat->values->data().get(), 1384 loTriFactorT->csrMat->row_offsets->data().get(), 1385 loTriFactorT->csrMat->column_indices->data().get(), 1386 loTriFactorT->solveInfo, 1387 tempGPU->data().get(), xarray 1388 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1389 ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1390 #endif 1391 );CHKERRCUSPARSE(stat); 1392 1393 /* restore */ 1394 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1395 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1396 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1397 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1398 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1399 PetscFunctionReturn(0); 1400 } 1401 1402 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1403 { 1404 const PetscScalar *barray; 1405 PetscScalar *xarray; 1406 thrust::device_ptr<const PetscScalar> bGPU; 1407 thrust::device_ptr<PetscScalar> xGPU; 1408 cusparseStatus_t stat; 1409 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1410 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1411 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1412 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1413 PetscErrorCode ierr; 1414 cudaError_t cerr; 1415 1416 PetscFunctionBegin; 1417 1418 /* Get the GPU pointers */ 1419 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1420 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1421 xGPU = thrust::device_pointer_cast(xarray); 1422 bGPU = thrust::device_pointer_cast(barray); 1423 1424 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1425 /* First, reorder with the row permutation */ 1426 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1427 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1428 tempGPU->begin()); 1429 1430 /* Next, solve L */ 1431 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1432 loTriFactor->csrMat->num_rows, 1433 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1434 loTriFactor->csrMat->num_entries, 1435 #endif 1436 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1437 loTriFactor->csrMat->values->data().get(), 1438 loTriFactor->csrMat->row_offsets->data().get(), 1439 loTriFactor->csrMat->column_indices->data().get(), 1440 loTriFactor->solveInfo, 1441 tempGPU->data().get(), xarray 1442 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1443 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1444 #endif 1445 );CHKERRCUSPARSE(stat); 1446 1447 /* Then, solve U */ 1448 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1449 upTriFactor->csrMat->num_rows, 1450 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1451 upTriFactor->csrMat->num_entries, 1452 #endif 1453 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1454 upTriFactor->csrMat->values->data().get(), 1455 upTriFactor->csrMat->row_offsets->data().get(), 1456 upTriFactor->csrMat->column_indices->data().get(), 1457 upTriFactor->solveInfo, 1458 xarray, tempGPU->data().get() 1459 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1460 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1461 #endif 1462 );CHKERRCUSPARSE(stat); 1463 1464 /* Last, reorder with the column permutation */ 1465 thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 1466 thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 1467 xGPU); 1468 1469 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1470 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1471 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1472 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1473 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1474 PetscFunctionReturn(0); 1475 } 1476 1477 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1478 { 1479 const PetscScalar *barray; 1480 PetscScalar *xarray; 1481 cusparseStatus_t stat; 1482 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1483 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1484 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1485 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1486 PetscErrorCode ierr; 1487 cudaError_t cerr; 1488 1489 PetscFunctionBegin; 1490 /* Get the GPU pointers */ 1491 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1492 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1493 1494 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1495 /* First, solve L */ 1496 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1497 loTriFactor->csrMat->num_rows, 1498 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1499 loTriFactor->csrMat->num_entries, 1500 #endif 1501 &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1502 loTriFactor->csrMat->values->data().get(), 1503 loTriFactor->csrMat->row_offsets->data().get(), 1504 loTriFactor->csrMat->column_indices->data().get(), 1505 loTriFactor->solveInfo, 1506 barray, tempGPU->data().get() 1507 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1508 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1509 #endif 1510 );CHKERRCUSPARSE(stat); 1511 1512 /* Next, solve U */ 1513 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1514 upTriFactor->csrMat->num_rows, 1515 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1516 upTriFactor->csrMat->num_entries, 1517 #endif 1518 &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1519 upTriFactor->csrMat->values->data().get(), 1520 upTriFactor->csrMat->row_offsets->data().get(), 1521 upTriFactor->csrMat->column_indices->data().get(), 1522 upTriFactor->solveInfo, 1523 tempGPU->data().get(), xarray 1524 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1525 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1526 #endif 1527 );CHKERRCUSPARSE(stat); 1528 1529 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1530 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1531 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1532 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1533 ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1534 PetscFunctionReturn(0); 1535 } 1536 1537 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 1538 { 1539 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1540 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1541 cudaError_t cerr; 1542 PetscErrorCode ierr; 1543 1544 PetscFunctionBegin; 1545 if (A->offloadmask == PETSC_OFFLOAD_GPU) { 1546 CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 1547 1548 ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1549 cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1550 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1551 ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 1552 ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 1553 A->offloadmask = PETSC_OFFLOAD_BOTH; 1554 } 1555 PetscFunctionReturn(0); 1556 } 1557 1558 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 1559 { 1560 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1561 PetscErrorCode ierr; 1562 1563 PetscFunctionBegin; 1564 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 1565 *array = a->a; 1566 A->offloadmask = PETSC_OFFLOAD_CPU; 1567 PetscFunctionReturn(0); 1568 } 1569 1570 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1571 { 1572 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1573 Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 1574 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1575 PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1576 PetscErrorCode ierr; 1577 cusparseStatus_t stat; 1578 cudaError_t err; 1579 1580 PetscFunctionBegin; 1581 if (A->boundtocpu) PetscFunctionReturn(0); 1582 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1583 if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1584 /* Copy values only */ 1585 CsrMatrix *matrix,*matrixT; 1586 matrix = (CsrMatrix*)cusparsestruct->mat->mat; 1587 1588 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1589 matrix->values->assign(a->a, a->a+a->nz); 1590 err = WaitForCUDA();CHKERRCUDA(err); 1591 ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 1592 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1593 1594 /* Update matT when it was built before */ 1595 if (cusparsestruct->matTranspose) { 1596 cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr); 1597 matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat; 1598 ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1599 stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1600 A->cmap->n, matrix->num_entries, 1601 matrix->values->data().get(), 1602 cusparsestruct->rowoffsets_gpu->data().get(), 1603 matrix->column_indices->data().get(), 1604 matrixT->values->data().get(), 1605 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1606 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1607 CUSPARSE_ACTION_NUMERIC,indexBase, 1608 cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1609 #else 1610 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1611 CUSPARSE_ACTION_NUMERIC, indexBase 1612 #endif 1613 );CHKERRCUSPARSE(stat); 1614 err = WaitForCUDA();CHKERRCUDA(err); 1615 ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1616 } 1617 } else { 1618 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1619 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 1620 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 1621 delete cusparsestruct->workVector; 1622 delete cusparsestruct->rowoffsets_gpu; 1623 try { 1624 if (a->compressedrow.use) { 1625 m = a->compressedrow.nrows; 1626 ii = a->compressedrow.i; 1627 ridx = a->compressedrow.rindex; 1628 } else { 1629 m = A->rmap->n; 1630 ii = a->i; 1631 ridx = NULL; 1632 } 1633 cusparsestruct->nrows = m; 1634 1635 /* create cusparse matrix */ 1636 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1637 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 1638 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 1639 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1640 1641 err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 1642 err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 1643 err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1644 err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1645 err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1646 err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1647 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1648 1649 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1650 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1651 /* set the matrix */ 1652 CsrMatrix *mat= new CsrMatrix; 1653 mat->num_rows = m; 1654 mat->num_cols = A->cmap->n; 1655 mat->num_entries = a->nz; 1656 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1657 mat->row_offsets->assign(ii, ii + m+1); 1658 1659 mat->column_indices = new THRUSTINTARRAY32(a->nz); 1660 mat->column_indices->assign(a->j, a->j+a->nz); 1661 1662 mat->values = new THRUSTARRAY(a->nz); 1663 mat->values->assign(a->a, a->a+a->nz); 1664 1665 /* assign the pointer */ 1666 matstruct->mat = mat; 1667 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1668 if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1669 stat = cusparseCreateCsr(&matstruct->matDescr, 1670 mat->num_rows, mat->num_cols, mat->num_entries, 1671 mat->row_offsets->data().get(), mat->column_indices->data().get(), 1672 mat->values->data().get(), 1673 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1674 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1675 } 1676 #endif 1677 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1678 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1679 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1680 #else 1681 CsrMatrix *mat= new CsrMatrix; 1682 mat->num_rows = m; 1683 mat->num_cols = A->cmap->n; 1684 mat->num_entries = a->nz; 1685 mat->row_offsets = new THRUSTINTARRAY32(m+1); 1686 mat->row_offsets->assign(ii, ii + m+1); 1687 1688 mat->column_indices = new THRUSTINTARRAY32(a->nz); 1689 mat->column_indices->assign(a->j, a->j+a->nz); 1690 1691 mat->values = new THRUSTARRAY(a->nz); 1692 mat->values->assign(a->a, a->a+a->nz); 1693 1694 cusparseHybMat_t hybMat; 1695 stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1696 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1697 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1698 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1699 matstruct->descr, mat->values->data().get(), 1700 mat->row_offsets->data().get(), 1701 mat->column_indices->data().get(), 1702 hybMat, 0, partition);CHKERRCUSPARSE(stat); 1703 /* assign the pointer */ 1704 matstruct->mat = hybMat; 1705 1706 if (mat) { 1707 if (mat->values) delete (THRUSTARRAY*)mat->values; 1708 if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1709 if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1710 delete (CsrMatrix*)mat; 1711 } 1712 #endif 1713 } 1714 1715 /* assign the compressed row indices */ 1716 if (a->compressedrow.use) { 1717 cusparsestruct->workVector = new THRUSTARRAY(m); 1718 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1719 matstruct->cprowIndices->assign(ridx,ridx+m); 1720 tmp = m; 1721 } else { 1722 cusparsestruct->workVector = NULL; 1723 matstruct->cprowIndices = NULL; 1724 tmp = 0; 1725 } 1726 ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1727 1728 /* assign the pointer */ 1729 cusparsestruct->mat = matstruct; 1730 } catch(char *ex) { 1731 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1732 } 1733 err = WaitForCUDA();CHKERRCUDA(err); 1734 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1735 cusparsestruct->nonzerostate = A->nonzerostate; 1736 } 1737 A->offloadmask = PETSC_OFFLOAD_BOTH; 1738 } 1739 PetscFunctionReturn(0); 1740 } 1741 1742 struct VecCUDAPlusEquals 1743 { 1744 template <typename Tuple> 1745 __host__ __device__ 1746 void operator()(Tuple t) 1747 { 1748 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1749 } 1750 }; 1751 1752 struct VecCUDAEquals 1753 { 1754 template <typename Tuple> 1755 __host__ __device__ 1756 void operator()(Tuple t) 1757 { 1758 thrust::get<1>(t) = thrust::get<0>(t); 1759 } 1760 }; 1761 1762 struct VecCUDAEqualsReverse 1763 { 1764 template <typename Tuple> 1765 __host__ __device__ 1766 void operator()(Tuple t) 1767 { 1768 thrust::get<0>(t) = thrust::get<1>(t); 1769 } 1770 }; 1771 1772 struct MatMatCusparse { 1773 PetscBool cisdense; 1774 PetscScalar *Bt; 1775 Mat X; 1776 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1777 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1778 cusparseDnMatDescr_t matBDescr; 1779 cusparseDnMatDescr_t matCDescr; 1780 size_t spmmBufferSize; 1781 void *spmmBuffer; 1782 PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1783 #endif 1784 }; 1785 1786 static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1787 { 1788 PetscErrorCode ierr; 1789 MatMatCusparse *mmdata = (MatMatCusparse *)data; 1790 cudaError_t cerr; 1791 1792 PetscFunctionBegin; 1793 cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1794 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1795 cusparseStatus_t stat; 1796 if (mmdata->matBDescr) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);} 1797 if (mmdata->matCDescr) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);} 1798 if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 1799 #endif 1800 ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1801 ierr = PetscFree(data);CHKERRQ(ierr); 1802 PetscFunctionReturn(0); 1803 } 1804 1805 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1806 1807 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1808 { 1809 Mat_Product *product = C->product; 1810 Mat A,B; 1811 PetscInt m,n,blda,clda; 1812 PetscBool flg,biscuda; 1813 Mat_SeqAIJCUSPARSE *cusp; 1814 cusparseStatus_t stat; 1815 cusparseOperation_t opA; 1816 const PetscScalar *barray; 1817 PetscScalar *carray; 1818 PetscErrorCode ierr; 1819 MatMatCusparse *mmdata; 1820 Mat_SeqAIJCUSPARSEMultStruct *mat; 1821 CsrMatrix *csrmat; 1822 cudaError_t cerr; 1823 1824 PetscFunctionBegin; 1825 MatCheckProduct(C,1); 1826 if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1827 mmdata = (MatMatCusparse*)product->data; 1828 A = product->A; 1829 B = product->B; 1830 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1831 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1832 /* currently CopyToGpu does not copy if the matrix is bound to CPU 1833 Instead of silently accepting the wrong answer, I prefer to raise the error */ 1834 if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1835 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1836 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1837 switch (product->type) { 1838 case MATPRODUCT_AB: 1839 case MATPRODUCT_PtAP: 1840 mat = cusp->mat; 1841 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1842 m = A->rmap->n; 1843 n = B->cmap->n; 1844 break; 1845 case MATPRODUCT_AtB: 1846 if (!cusp->transgen) { 1847 mat = cusp->mat; 1848 opA = CUSPARSE_OPERATION_TRANSPOSE; 1849 } else { 1850 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1851 mat = cusp->matTranspose; 1852 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1853 } 1854 m = A->cmap->n; 1855 n = B->cmap->n; 1856 break; 1857 case MATPRODUCT_ABt: 1858 case MATPRODUCT_RARt: 1859 mat = cusp->mat; 1860 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1861 m = A->rmap->n; 1862 n = B->rmap->n; 1863 break; 1864 default: 1865 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1866 } 1867 if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1868 csrmat = (CsrMatrix*)mat->mat; 1869 /* if the user passed a CPU matrix, copy the data to the GPU */ 1870 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1871 if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 1872 ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 1873 1874 ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 1875 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1876 ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1877 ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 1878 } else { 1879 ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 1880 ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 1881 } 1882 1883 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1884 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1885 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 1886 /* (re)allcoate spmmBuffer if not initialized or LDAs are different */ 1887 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 1888 if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 1889 if (!mmdata->matBDescr) { 1890 stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 1891 mmdata->Blda = blda; 1892 } 1893 1894 if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 1895 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 1896 stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 1897 mmdata->Clda = clda; 1898 } 1899 1900 if (!mat->matDescr) { 1901 stat = cusparseCreateCsr(&mat->matDescr, 1902 csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 1903 csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 1904 csrmat->values->data().get(), 1905 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1906 CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1907 } 1908 stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 1909 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 1910 mmdata->matCDescr,cusparse_scalartype, 1911 cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat); 1912 if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);} 1913 cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr); 1914 mmdata->initialized = PETSC_TRUE; 1915 } else { 1916 /* to be safe, always update pointers of the mats */ 1917 stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 1918 stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 1919 stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 1920 } 1921 1922 /* do cusparseSpMM, which supports transpose on B */ 1923 stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 1924 mat->matDescr,mmdata->matBDescr,mat->beta_zero, 1925 mmdata->matCDescr,cusparse_scalartype, 1926 cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat); 1927 #else 1928 PetscInt k; 1929 /* cusparseXcsrmm does not support transpose on B */ 1930 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 1931 cublasHandle_t cublasv2handle; 1932 cublasStatus_t cerr; 1933 1934 ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 1935 cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 1936 B->cmap->n,B->rmap->n, 1937 &PETSC_CUSPARSE_ONE ,barray,blda, 1938 &PETSC_CUSPARSE_ZERO,barray,blda, 1939 mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 1940 blda = B->cmap->n; 1941 k = B->cmap->n; 1942 } else { 1943 k = B->rmap->n; 1944 } 1945 1946 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 1947 stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 1948 csrmat->num_entries,mat->alpha_one,mat->descr, 1949 csrmat->values->data().get(), 1950 csrmat->row_offsets->data().get(), 1951 csrmat->column_indices->data().get(), 1952 mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 1953 carray,clda);CHKERRCUSPARSE(stat); 1954 #endif 1955 cerr = WaitForCUDA();CHKERRCUDA(cerr); 1956 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1957 ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 1958 ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 1959 if (product->type == MATPRODUCT_RARt) { 1960 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1961 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 1962 } else if (product->type == MATPRODUCT_PtAP) { 1963 ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1964 ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 1965 } else { 1966 ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 1967 } 1968 if (mmdata->cisdense) { 1969 ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 1970 } 1971 if (!biscuda) { 1972 ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 1973 } 1974 PetscFunctionReturn(0); 1975 } 1976 1977 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1978 { 1979 Mat_Product *product = C->product; 1980 Mat A,B; 1981 PetscInt m,n; 1982 PetscBool cisdense,flg; 1983 PetscErrorCode ierr; 1984 MatMatCusparse *mmdata; 1985 Mat_SeqAIJCUSPARSE *cusp; 1986 1987 PetscFunctionBegin; 1988 MatCheckProduct(C,1); 1989 if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 1990 A = product->A; 1991 B = product->B; 1992 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1993 if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1994 cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1995 if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 1996 switch (product->type) { 1997 case MATPRODUCT_AB: 1998 m = A->rmap->n; 1999 n = B->cmap->n; 2000 break; 2001 case MATPRODUCT_AtB: 2002 m = A->cmap->n; 2003 n = B->cmap->n; 2004 break; 2005 case MATPRODUCT_ABt: 2006 m = A->rmap->n; 2007 n = B->rmap->n; 2008 break; 2009 case MATPRODUCT_PtAP: 2010 m = B->cmap->n; 2011 n = B->cmap->n; 2012 break; 2013 case MATPRODUCT_RARt: 2014 m = B->rmap->n; 2015 n = B->rmap->n; 2016 break; 2017 default: 2018 SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2019 } 2020 ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2021 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2022 ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2023 ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2024 2025 /* product data */ 2026 ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2027 mmdata->cisdense = cisdense; 2028 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2029 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2030 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2031 cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2032 } 2033 #endif 2034 /* for these products we need intermediate storage */ 2035 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2036 ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2037 ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2038 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2039 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 2040 } else { 2041 ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2042 } 2043 } 2044 C->product->data = mmdata; 2045 C->product->destroy = MatDestroy_MatMatCusparse; 2046 2047 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2048 PetscFunctionReturn(0); 2049 } 2050 2051 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2052 2053 /* handles dense B */ 2054 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C) 2055 { 2056 Mat_Product *product = C->product; 2057 PetscErrorCode ierr; 2058 2059 PetscFunctionBegin; 2060 MatCheckProduct(C,1); 2061 if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 2062 if (product->A->boundtocpu) { 2063 ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr); 2064 PetscFunctionReturn(0); 2065 } 2066 switch (product->type) { 2067 case MATPRODUCT_AB: 2068 case MATPRODUCT_AtB: 2069 case MATPRODUCT_ABt: 2070 case MATPRODUCT_PtAP: 2071 case MATPRODUCT_RARt: 2072 C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2073 default: 2074 break; 2075 } 2076 PetscFunctionReturn(0); 2077 } 2078 2079 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2080 { 2081 PetscErrorCode ierr; 2082 2083 PetscFunctionBegin; 2084 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2085 PetscFunctionReturn(0); 2086 } 2087 2088 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2089 { 2090 PetscErrorCode ierr; 2091 2092 PetscFunctionBegin; 2093 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2094 PetscFunctionReturn(0); 2095 } 2096 2097 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2098 { 2099 PetscErrorCode ierr; 2100 2101 PetscFunctionBegin; 2102 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2103 PetscFunctionReturn(0); 2104 } 2105 2106 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2107 { 2108 PetscErrorCode ierr; 2109 2110 PetscFunctionBegin; 2111 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2112 PetscFunctionReturn(0); 2113 } 2114 2115 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2116 { 2117 PetscErrorCode ierr; 2118 2119 PetscFunctionBegin; 2120 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2121 PetscFunctionReturn(0); 2122 } 2123 2124 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2125 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 2126 { 2127 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2128 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2129 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2130 PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2131 PetscErrorCode ierr; 2132 cudaError_t cerr; 2133 cusparseStatus_t stat; 2134 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2135 PetscBool compressed; 2136 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2137 PetscInt nx,ny; 2138 #endif 2139 2140 PetscFunctionBegin; 2141 if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported"); 2142 if (!a->nonzerorowcnt) { 2143 if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2144 else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2145 PetscFunctionReturn(0); 2146 } 2147 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 2148 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2149 if (!trans) { 2150 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2151 if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2152 } else { 2153 if (herm || !cusparsestruct->transgen) { 2154 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2155 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2156 } else { 2157 if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);} 2158 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2159 } 2160 } 2161 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2162 compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2163 2164 try { 2165 ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2166 if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2167 else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2168 2169 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2170 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2171 /* z = A x + beta y. 2172 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2173 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2174 */ 2175 xptr = xarray; 2176 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2177 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2178 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2179 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2180 allocated to accommodate different uses. So we get the length info directly from mat. 2181 */ 2182 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2183 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2184 nx = mat->num_cols; 2185 ny = mat->num_rows; 2186 } 2187 #endif 2188 } else { 2189 /* z = A^T x + beta y 2190 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2191 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2192 */ 2193 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2194 dptr = zarray; 2195 beta = yy ? matstruct->beta_one : matstruct->beta_zero; 2196 if (compressed) { /* Scatter x to work vector */ 2197 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2198 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2199 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2200 VecCUDAEqualsReverse()); 2201 } 2202 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2203 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2204 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2205 nx = mat->num_rows; 2206 ny = mat->num_cols; 2207 } 2208 #endif 2209 } 2210 2211 /* csr_spmv does y = alpha op(A) x + beta y */ 2212 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2213 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2214 if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2215 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2216 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2217 stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2218 stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2219 matstruct->matDescr, 2220 matstruct->cuSpMV[opA].vecXDescr, beta, 2221 matstruct->cuSpMV[opA].vecYDescr, 2222 cusparse_scalartype, 2223 cusparsestruct->spmvAlg, 2224 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2225 cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2226 2227 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2228 } else { 2229 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2230 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2231 stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2232 } 2233 2234 stat = cusparseSpMV(cusparsestruct->handle, opA, 2235 matstruct->alpha_one, 2236 matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */ 2237 matstruct->cuSpMV[opA].vecXDescr, 2238 beta, 2239 matstruct->cuSpMV[opA].vecYDescr, 2240 cusparse_scalartype, 2241 cusparsestruct->spmvAlg, 2242 matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2243 #else 2244 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2245 stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2246 mat->num_rows, mat->num_cols, 2247 mat->num_entries, matstruct->alpha_one, matstruct->descr, 2248 mat->values->data().get(), mat->row_offsets->data().get(), 2249 mat->column_indices->data().get(), xptr, beta, 2250 dptr);CHKERRCUSPARSE(stat); 2251 #endif 2252 } else { 2253 if (cusparsestruct->nrows) { 2254 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2255 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2256 #else 2257 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2258 stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2259 matstruct->alpha_one, matstruct->descr, hybMat, 2260 xptr, beta, 2261 dptr);CHKERRCUSPARSE(stat); 2262 #endif 2263 } 2264 } 2265 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2266 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2267 2268 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2269 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2270 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2271 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2272 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2273 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2274 } 2275 } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2276 ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 2277 } 2278 2279 /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2280 if (compressed) { 2281 thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2282 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2283 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2284 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2285 VecCUDAPlusEquals()); 2286 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2287 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2288 } 2289 } else { 2290 if (yy && yy != zz) { 2291 ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2292 } 2293 } 2294 ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2295 if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2296 else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 2297 } catch(char *ex) { 2298 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2299 } 2300 if (yy) { 2301 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2302 } else { 2303 ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2304 } 2305 PetscFunctionReturn(0); 2306 } 2307 2308 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2309 { 2310 PetscErrorCode ierr; 2311 2312 PetscFunctionBegin; 2313 ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2314 PetscFunctionReturn(0); 2315 } 2316 2317 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 2318 { 2319 PetscErrorCode ierr; 2320 PetscSplitCSRDataStructure *d_mat = NULL, h_mat; 2321 PetscBool is_seq = PETSC_TRUE; 2322 PetscInt nnz_state = A->nonzerostate; 2323 PetscFunctionBegin; 2324 if (A->factortype == MAT_FACTOR_NONE) { 2325 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2326 } 2327 if (d_mat) { 2328 cudaError_t err; 2329 ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr); 2330 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 2331 nnz_state = h_mat.nonzerostate; 2332 is_seq = h_mat.seq; 2333 } 2334 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 2335 if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2336 if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU eventhough equiped for GPU 2337 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2338 } else if (nnz_state > A->nonzerostate) { 2339 A->offloadmask = PETSC_OFFLOAD_GPU; 2340 } 2341 2342 PetscFunctionReturn(0); 2343 } 2344 2345 /* --------------------------------------------------------------------------------*/ 2346 /*@ 2347 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2348 (the default parallel PETSc format). This matrix will ultimately pushed down 2349 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2350 assembly performance the user should preallocate the matrix storage by setting 2351 the parameter nz (or the array nnz). By setting these parameters accurately, 2352 performance during matrix assembly can be increased by more than a factor of 50. 2353 2354 Collective 2355 2356 Input Parameters: 2357 + comm - MPI communicator, set to PETSC_COMM_SELF 2358 . m - number of rows 2359 . n - number of columns 2360 . nz - number of nonzeros per row (same for all rows) 2361 - nnz - array containing the number of nonzeros in the various rows 2362 (possibly different for each row) or NULL 2363 2364 Output Parameter: 2365 . A - the matrix 2366 2367 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 2368 MatXXXXSetPreallocation() paradgm instead of this routine directly. 2369 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 2370 2371 Notes: 2372 If nnz is given then nz is ignored 2373 2374 The AIJ format (also called the Yale sparse matrix format or 2375 compressed row storage), is fully compatible with standard Fortran 77 2376 storage. That is, the stored row and column indices can begin at 2377 either one (as in Fortran) or zero. See the users' manual for details. 2378 2379 Specify the preallocated storage with either nz or nnz (not both). 2380 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 2381 allocation. For large problems you MUST preallocate memory or you 2382 will get TERRIBLE performance, see the users' manual chapter on matrices. 2383 2384 By default, this format uses inodes (identical nodes) when possible, to 2385 improve numerical efficiency of matrix-vector products and solves. We 2386 search for consecutive rows with the same nonzero structure, thereby 2387 reusing matrix information to achieve increased efficiency. 2388 2389 Level: intermediate 2390 2391 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 2392 @*/ 2393 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 2394 { 2395 PetscErrorCode ierr; 2396 2397 PetscFunctionBegin; 2398 ierr = MatCreate(comm,A);CHKERRQ(ierr); 2399 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 2400 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2401 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 2402 PetscFunctionReturn(0); 2403 } 2404 2405 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 2406 { 2407 PetscErrorCode ierr; 2408 PetscSplitCSRDataStructure *d_mat = NULL; 2409 2410 PetscFunctionBegin; 2411 if (A->factortype == MAT_FACTOR_NONE) { 2412 d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2413 ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 2414 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 2415 } else { 2416 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 2417 } 2418 if (d_mat) { 2419 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2420 cudaError_t err; 2421 PetscSplitCSRDataStructure h_mat; 2422 ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 2423 err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 2424 if (h_mat.seq) { 2425 if (a->compressedrow.use) { 2426 err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 2427 } 2428 err = cudaFree(d_mat);CHKERRCUDA(err); 2429 } 2430 } 2431 ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 2432 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 2433 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 2434 ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 2435 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 2436 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 2437 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 2438 PetscFunctionReturn(0); 2439 } 2440 2441 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 2442 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 2443 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 2444 { 2445 PetscErrorCode ierr; 2446 2447 PetscFunctionBegin; 2448 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 2449 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 2450 PetscFunctionReturn(0); 2451 } 2452 2453 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 2454 { 2455 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2456 PetscErrorCode ierr; 2457 2458 PetscFunctionBegin; 2459 if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 2460 if (flg) { 2461 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 2462 2463 A->ops->mult = MatMult_SeqAIJ; 2464 A->ops->multadd = MatMultAdd_SeqAIJ; 2465 A->ops->multtranspose = MatMultTranspose_SeqAIJ; 2466 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 2467 A->ops->multhermitiantranspose = NULL; 2468 A->ops->multhermitiantransposeadd = NULL; 2469 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 2470 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 2471 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 2472 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 2473 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 2474 } else { 2475 A->ops->mult = MatMult_SeqAIJCUSPARSE; 2476 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 2477 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 2478 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 2479 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 2480 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 2481 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 2482 ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 2483 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 2484 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 2485 ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 2486 } 2487 A->boundtocpu = flg; 2488 a->inode.use = flg; 2489 PetscFunctionReturn(0); 2490 } 2491 2492 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 2493 { 2494 PetscSplitCSRDataStructure *d_mat = NULL; 2495 PetscErrorCode ierr; 2496 PetscBool both = PETSC_FALSE; 2497 2498 PetscFunctionBegin; 2499 if (A->factortype == MAT_FACTOR_NONE) { 2500 Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 2501 if (spptr->mat) { 2502 CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 2503 if (matrix->values) { 2504 both = PETSC_TRUE; 2505 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 2506 } 2507 } 2508 if (spptr->matTranspose) { 2509 CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 2510 if (matrix->values) { 2511 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 2512 } 2513 } 2514 d_mat = spptr->deviceMat; 2515 } 2516 if (d_mat) { 2517 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2518 PetscInt n = A->rmap->n, nnz = a->i[n]; 2519 cudaError_t err; 2520 PetscScalar *vals; 2521 ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr); 2522 err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 2523 err = cudaMemset( vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err); 2524 } 2525 ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 2526 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 2527 2528 PetscFunctionReturn(0); 2529 } 2530 2531 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 2532 { 2533 PetscErrorCode ierr; 2534 cusparseStatus_t stat; 2535 Mat B; 2536 2537 PetscFunctionBegin; 2538 if (reuse == MAT_INITIAL_MATRIX) { 2539 ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 2540 } else if (reuse == MAT_REUSE_MATRIX) { 2541 ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 2542 } 2543 B = *newmat; 2544 2545 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 2546 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 2547 2548 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 2549 if (B->factortype == MAT_FACTOR_NONE) { 2550 Mat_SeqAIJCUSPARSE *spptr; 2551 2552 ierr = PetscNew(&spptr);CHKERRQ(ierr); 2553 spptr->format = MAT_CUSPARSE_CSR; 2554 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 2555 B->spptr = spptr; 2556 spptr->deviceMat = NULL; 2557 } else { 2558 Mat_SeqAIJCUSPARSETriFactors *spptr; 2559 2560 ierr = PetscNew(&spptr);CHKERRQ(ierr); 2561 stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 2562 B->spptr = spptr; 2563 } 2564 B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2565 } 2566 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 2567 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 2568 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 2569 B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 2570 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 2571 B->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 2572 2573 ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 2574 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2575 ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 2576 PetscFunctionReturn(0); 2577 } 2578 2579 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 2580 { 2581 PetscErrorCode ierr; 2582 2583 PetscFunctionBegin; 2584 ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); 2585 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 2586 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2587 ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr); 2588 ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr); 2589 ierr = PetscOptionsEnd();CHKERRQ(ierr); 2590 PetscFunctionReturn(0); 2591 } 2592 2593 /*MC 2594 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 2595 2596 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 2597 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 2598 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 2599 2600 Options Database Keys: 2601 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 2602 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2603 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 2604 2605 Level: beginner 2606 2607 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 2608 M*/ 2609 2610 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 2611 2612 2613 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 2614 { 2615 PetscErrorCode ierr; 2616 2617 PetscFunctionBegin; 2618 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2619 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2620 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2621 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 2622 PetscFunctionReturn(0); 2623 } 2624 2625 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 2626 { 2627 PetscErrorCode ierr; 2628 cusparseStatus_t stat; 2629 2630 PetscFunctionBegin; 2631 if (*cusparsestruct) { 2632 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 2633 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 2634 delete (*cusparsestruct)->workVector; 2635 delete (*cusparsestruct)->rowoffsets_gpu; 2636 delete (*cusparsestruct)->cooPerm; 2637 delete (*cusparsestruct)->cooPerm_a; 2638 delete (*cusparsestruct)->cooPerm_v; 2639 delete (*cusparsestruct)->cooPerm_w; 2640 if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 2641 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2642 cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr); 2643 #endif 2644 ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 2645 } 2646 PetscFunctionReturn(0); 2647 } 2648 2649 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 2650 { 2651 PetscFunctionBegin; 2652 if (*mat) { 2653 delete (*mat)->values; 2654 delete (*mat)->column_indices; 2655 delete (*mat)->row_offsets; 2656 delete *mat; 2657 *mat = 0; 2658 } 2659 PetscFunctionReturn(0); 2660 } 2661 2662 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 2663 { 2664 cusparseStatus_t stat; 2665 PetscErrorCode ierr; 2666 2667 PetscFunctionBegin; 2668 if (*trifactor) { 2669 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 2670 if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 2671 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 2672 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2673 cudaError_t cerr; 2674 if ((*trifactor)->solveBuffer) {cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 2675 if ((*trifactor)->csr2cscBuffer) {cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 2676 #endif 2677 delete *trifactor; 2678 *trifactor = NULL; 2679 } 2680 PetscFunctionReturn(0); 2681 } 2682 2683 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 2684 { 2685 CsrMatrix *mat; 2686 cusparseStatus_t stat; 2687 cudaError_t err; 2688 2689 PetscFunctionBegin; 2690 if (*matstruct) { 2691 if ((*matstruct)->mat) { 2692 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 2693 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2694 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2695 #else 2696 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 2697 stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 2698 #endif 2699 } else { 2700 mat = (CsrMatrix*)(*matstruct)->mat; 2701 CsrMatrix_Destroy(&mat); 2702 } 2703 } 2704 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 2705 delete (*matstruct)->cprowIndices; 2706 if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 2707 if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 2708 if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 2709 2710 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2711 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 2712 if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 2713 for (int i=0; i<3; i++) { 2714 if (mdata->cuSpMV[i].initialized) { 2715 err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 2716 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 2717 stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 2718 } 2719 } 2720 #endif 2721 delete *matstruct; 2722 *matstruct = NULL; 2723 } 2724 PetscFunctionReturn(0); 2725 } 2726 2727 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2728 { 2729 PetscErrorCode ierr; 2730 2731 PetscFunctionBegin; 2732 if (*trifactors) { 2733 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 2734 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 2735 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 2736 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 2737 delete (*trifactors)->rpermIndices; 2738 delete (*trifactors)->cpermIndices; 2739 delete (*trifactors)->workVector; 2740 (*trifactors)->rpermIndices = NULL; 2741 (*trifactors)->cpermIndices = NULL; 2742 (*trifactors)->workVector = NULL; 2743 } 2744 PetscFunctionReturn(0); 2745 } 2746 2747 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2748 { 2749 PetscErrorCode ierr; 2750 cusparseHandle_t handle; 2751 cusparseStatus_t stat; 2752 2753 PetscFunctionBegin; 2754 if (*trifactors) { 2755 ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 2756 if (handle = (*trifactors)->handle) { 2757 stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 2758 } 2759 ierr = PetscFree(*trifactors);CHKERRQ(ierr); 2760 } 2761 PetscFunctionReturn(0); 2762 } 2763 2764 struct IJCompare 2765 { 2766 __host__ __device__ 2767 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 2768 { 2769 if (t1.get<0>() < t2.get<0>()) return true; 2770 if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 2771 return false; 2772 } 2773 }; 2774 2775 struct IJEqual 2776 { 2777 __host__ __device__ 2778 inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 2779 { 2780 if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 2781 return true; 2782 } 2783 }; 2784 2785 struct IJDiff 2786 { 2787 __host__ __device__ 2788 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 2789 { 2790 return t1 == t2 ? 0 : 1; 2791 } 2792 }; 2793 2794 struct IJSum 2795 { 2796 __host__ __device__ 2797 inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 2798 { 2799 return t1||t2; 2800 } 2801 }; 2802 2803 #include <thrust/iterator/discard_iterator.h> 2804 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 2805 { 2806 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2807 CsrMatrix *matrix; 2808 PetscErrorCode ierr; 2809 cudaError_t cerr; 2810 PetscInt n; 2811 2812 PetscFunctionBegin; 2813 if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 2814 if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 2815 if (!cusp->cooPerm) { 2816 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2817 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2818 PetscFunctionReturn(0); 2819 } 2820 n = cusp->cooPerm->size(); 2821 matrix = (CsrMatrix*)cusp->mat->mat; 2822 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 2823 if (!cusp->cooPerm_v) { cusp->cooPerm_v = new THRUSTARRAY(n); } 2824 ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 2825 if (v) { 2826 cusp->cooPerm_v->assign(v,v+n); 2827 ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 2828 } 2829 else thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.); 2830 if (imode == ADD_VALUES) { 2831 if (cusp->cooPerm_a) { 2832 if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(matrix->values->size()); 2833 auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()); 2834 thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 2835 thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 2836 } else { 2837 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()), 2838 matrix->values->begin())); 2839 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()), 2840 matrix->values->end())); 2841 thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 2842 } 2843 } else { 2844 if (cusp->cooPerm_a) { /* non unique values insertion, result is undefined (we cannot guarantee last takes precedence) 2845 if we are inserting two different values into the same location */ 2846 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()), 2847 thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin()))); 2848 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()), 2849 thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end()))); 2850 thrust::for_each(zibit,zieit,VecCUDAEquals()); 2851 } else { 2852 auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()), 2853 matrix->values->begin())); 2854 auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()), 2855 matrix->values->end())); 2856 thrust::for_each(zibit,zieit,VecCUDAEquals()); 2857 } 2858 } 2859 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2860 ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 2861 A->offloadmask = PETSC_OFFLOAD_GPU; 2862 ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2863 ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2864 /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */ 2865 ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 2866 PetscFunctionReturn(0); 2867 } 2868 2869 #include <thrust/binary_search.h> 2870 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 2871 { 2872 PetscErrorCode ierr; 2873 Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2874 CsrMatrix *matrix; 2875 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2876 PetscInt cooPerm_n, nzr = 0; 2877 cudaError_t cerr; 2878 2879 PetscFunctionBegin; 2880 ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr); 2881 ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 2882 ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 2883 cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 2884 if (n != cooPerm_n) { 2885 delete cusp->cooPerm; 2886 delete cusp->cooPerm_v; 2887 delete cusp->cooPerm_w; 2888 delete cusp->cooPerm_a; 2889 cusp->cooPerm = NULL; 2890 cusp->cooPerm_v = NULL; 2891 cusp->cooPerm_w = NULL; 2892 cusp->cooPerm_a = NULL; 2893 } 2894 if (n) { 2895 THRUSTINTARRAY d_i(n); 2896 THRUSTINTARRAY d_j(n); 2897 THRUSTINTARRAY ii(A->rmap->n); 2898 2899 if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 2900 if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 2901 2902 ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 2903 d_i.assign(coo_i,coo_i+n); 2904 d_j.assign(coo_j,coo_j+n); 2905 auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 2906 auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 2907 2908 thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 2909 thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 2910 *cusp->cooPerm_a = d_i; 2911 THRUSTINTARRAY w = d_j; 2912 2913 auto nekey = thrust::unique(fkey, ekey, IJEqual()); 2914 if (nekey == ekey) { /* all entries are unique */ 2915 delete cusp->cooPerm_a; 2916 cusp->cooPerm_a = NULL; 2917 } else { /* I couldn't come up with a more elegant algorithm */ 2918 adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 2919 adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 2920 (*cusp->cooPerm_a)[0] = 0; 2921 w[0] = 0; 2922 thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 2923 thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 2924 } 2925 thrust::counting_iterator<PetscInt> search_begin(0); 2926 thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 2927 search_begin, search_begin + A->rmap->n, 2928 ii.begin()); 2929 2930 ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 2931 a->singlemalloc = PETSC_FALSE; 2932 a->free_a = PETSC_TRUE; 2933 a->free_ij = PETSC_TRUE; 2934 ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 2935 a->i[0] = 0; 2936 cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2937 a->nz = a->maxnz = a->i[A->rmap->n]; 2938 ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 2939 ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 2940 cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2941 if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 2942 if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 2943 for (PetscInt i = 0; i < A->rmap->n; i++) { 2944 const PetscInt nnzr = a->i[i+1] - a->i[i]; 2945 nzr += (PetscInt)!!(nnzr); 2946 a->ilen[i] = a->imax[i] = nnzr; 2947 } 2948 A->preallocated = PETSC_TRUE; 2949 ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 2950 } else { 2951 ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 2952 } 2953 cerr = WaitForCUDA();CHKERRCUDA(cerr); 2954 ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr); 2955 2956 /* We want to allocate the CUSPARSE struct for matvec now. 2957 The code is so convoluted now that I prefer to copy garbage to the GPU */ 2958 ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 2959 A->offloadmask = PETSC_OFFLOAD_CPU; 2960 A->nonzerostate++; 2961 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2962 { 2963 matrix = (CsrMatrix*)cusp->mat->mat; 2964 if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 2965 thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 2966 ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 2967 } 2968 2969 A->offloadmask = PETSC_OFFLOAD_CPU; 2970 A->assembled = PETSC_FALSE; 2971 A->was_assembled = PETSC_FALSE; 2972 PetscFunctionReturn(0); 2973 } 2974