1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_SPINLOCK 6 7 #include <petscconf.h> 8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9 #include <../src/mat/impls/sbaij/seq/sbaij.h> 10 #include <../src/vec/vec/impls/dvecimpl.h> 11 #include <petsc/private/vecimpl.h> 12 #undef VecType 13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 14 15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 16 17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 20 21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 24 25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 34 35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 36 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 37 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 38 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 39 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 40 41 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 42 { 43 cusparseStatus_t stat; 44 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 45 46 PetscFunctionBegin; 47 cusparsestruct->stream = stream; 48 stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat); 49 PetscFunctionReturn(0); 50 } 51 52 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 53 { 54 cusparseStatus_t stat; 55 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 56 57 PetscFunctionBegin; 58 if (cusparsestruct->handle != handle) { 59 if (cusparsestruct->handle) { 60 stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat); 61 } 62 cusparsestruct->handle = handle; 63 } 64 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 65 PetscFunctionReturn(0); 66 } 67 68 PetscErrorCode MatCUSPARSEClearHandle(Mat A) 69 { 70 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 71 PetscFunctionBegin; 72 if (cusparsestruct->handle) 73 cusparsestruct->handle = 0; 74 PetscFunctionReturn(0); 75 } 76 77 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 78 { 79 PetscFunctionBegin; 80 *type = MATSOLVERCUSPARSE; 81 PetscFunctionReturn(0); 82 } 83 84 /*MC 85 MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 86 on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 87 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 88 performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 89 CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 90 algorithms are not recommended. This class does NOT support direct solver operations. 91 92 Level: beginner 93 94 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 95 M*/ 96 97 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 98 { 99 PetscErrorCode ierr; 100 PetscInt n = A->rmap->n; 101 102 PetscFunctionBegin; 103 ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 104 (*B)->factortype = ftype; 105 ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 106 ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 107 108 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 109 ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 110 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 111 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 112 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 113 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 114 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 115 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 116 117 ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 118 ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 119 PetscFunctionReturn(0); 120 } 121 122 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 123 { 124 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 125 126 PetscFunctionBegin; 127 #if CUDA_VERSION>=4020 128 switch (op) { 129 case MAT_CUSPARSE_MULT: 130 cusparsestruct->format = format; 131 break; 132 case MAT_CUSPARSE_ALL: 133 cusparsestruct->format = format; 134 break; 135 default: 136 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 137 } 138 #else 139 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later."); 140 #endif 141 PetscFunctionReturn(0); 142 } 143 144 /*@ 145 MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 146 operation. Only the MatMult operation can use different GPU storage formats 147 for MPIAIJCUSPARSE matrices. 148 Not Collective 149 150 Input Parameters: 151 + A - Matrix of type SEQAIJCUSPARSE 152 . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 153 - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 154 155 Output Parameter: 156 157 Level: intermediate 158 159 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 160 @*/ 161 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 162 { 163 PetscErrorCode ierr; 164 165 PetscFunctionBegin; 166 PetscValidHeaderSpecific(A, MAT_CLASSID,1); 167 ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 168 PetscFunctionReturn(0); 169 } 170 171 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 172 { 173 PetscErrorCode ierr; 174 MatCUSPARSEStorageFormat format; 175 PetscBool flg; 176 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 177 178 PetscFunctionBegin; 179 ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 180 if (A->factortype==MAT_FACTOR_NONE) { 181 ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 182 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 183 if (flg) { 184 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr); 185 } 186 } 187 ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 188 "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 189 if (flg) { 190 ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 191 } 192 ierr = PetscOptionsTail();CHKERRQ(ierr); 193 PetscFunctionReturn(0); 194 195 } 196 197 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 198 { 199 PetscErrorCode ierr; 200 201 PetscFunctionBegin; 202 ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 203 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 204 PetscFunctionReturn(0); 205 } 206 207 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 208 { 209 PetscErrorCode ierr; 210 211 PetscFunctionBegin; 212 ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 213 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 214 PetscFunctionReturn(0); 215 } 216 217 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 218 { 219 PetscErrorCode ierr; 220 221 PetscFunctionBegin; 222 ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 223 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 224 PetscFunctionReturn(0); 225 } 226 227 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 228 { 229 PetscErrorCode ierr; 230 231 PetscFunctionBegin; 232 ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 233 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 234 PetscFunctionReturn(0); 235 } 236 237 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 238 { 239 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 240 PetscInt n = A->rmap->n; 241 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 242 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 243 cusparseStatus_t stat; 244 const PetscInt *ai = a->i,*aj = a->j,*vi; 245 const MatScalar *aa = a->a,*v; 246 PetscInt *AiLo, *AjLo; 247 PetscScalar *AALo; 248 PetscInt i,nz, nzLower, offset, rowOffset; 249 PetscErrorCode ierr; 250 251 PetscFunctionBegin; 252 if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 253 try { 254 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 255 nzLower=n+ai[n]-ai[1]; 256 257 /* Allocate Space for the lower triangular matrix */ 258 ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr); 259 ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr); 260 ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr); 261 262 /* Fill the lower triangular matrix */ 263 AiLo[0] = (PetscInt) 0; 264 AiLo[n] = nzLower; 265 AjLo[0] = (PetscInt) 0; 266 AALo[0] = (MatScalar) 1.0; 267 v = aa; 268 vi = aj; 269 offset = 1; 270 rowOffset= 1; 271 for (i=1; i<n; i++) { 272 nz = ai[i+1] - ai[i]; 273 /* additional 1 for the term on the diagonal */ 274 AiLo[i] = rowOffset; 275 rowOffset += nz+1; 276 277 ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 278 ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 279 280 offset += nz; 281 AjLo[offset] = (PetscInt) i; 282 AALo[offset] = (MatScalar) 1.0; 283 offset += 1; 284 285 v += nz; 286 vi += nz; 287 } 288 289 /* allocate space for the triangular factor information */ 290 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 291 292 /* Create the matrix description */ 293 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat); 294 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 295 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 296 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat); 297 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat); 298 299 /* Create the solve analysis information */ 300 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat); 301 302 /* set the operation */ 303 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 304 305 /* set the matrix */ 306 loTriFactor->csrMat = new CsrMatrix; 307 loTriFactor->csrMat->num_rows = n; 308 loTriFactor->csrMat->num_cols = n; 309 loTriFactor->csrMat->num_entries = nzLower; 310 311 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 312 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 313 314 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 315 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 316 317 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 318 loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 319 320 /* perform the solve analysis */ 321 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 322 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 323 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 324 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat); 325 326 /* assign the pointer. Is this really necessary? */ 327 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 328 329 ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr); 330 ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr); 331 ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr); 332 } catch(char *ex) { 333 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 334 } 335 } 336 PetscFunctionReturn(0); 337 } 338 339 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 340 { 341 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 342 PetscInt n = A->rmap->n; 343 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 344 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 345 cusparseStatus_t stat; 346 const PetscInt *aj = a->j,*adiag = a->diag,*vi; 347 const MatScalar *aa = a->a,*v; 348 PetscInt *AiUp, *AjUp; 349 PetscScalar *AAUp; 350 PetscInt i,nz, nzUpper, offset; 351 PetscErrorCode ierr; 352 353 PetscFunctionBegin; 354 if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 355 try { 356 /* next, figure out the number of nonzeros in the upper triangular matrix. */ 357 nzUpper = adiag[0]-adiag[n]; 358 359 /* Allocate Space for the upper triangular matrix */ 360 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr); 361 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr); 362 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr); 363 364 /* Fill the upper triangular matrix */ 365 AiUp[0]=(PetscInt) 0; 366 AiUp[n]=nzUpper; 367 offset = nzUpper; 368 for (i=n-1; i>=0; i--) { 369 v = aa + adiag[i+1] + 1; 370 vi = aj + adiag[i+1] + 1; 371 372 /* number of elements NOT on the diagonal */ 373 nz = adiag[i] - adiag[i+1]-1; 374 375 /* decrement the offset */ 376 offset -= (nz+1); 377 378 /* first, set the diagonal elements */ 379 AjUp[offset] = (PetscInt) i; 380 AAUp[offset] = (MatScalar)1./v[nz]; 381 AiUp[i] = AiUp[i+1] - (nz+1); 382 383 ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 384 ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 385 } 386 387 /* allocate space for the triangular factor information */ 388 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 389 390 /* Create the matrix description */ 391 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat); 392 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 393 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 394 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat); 395 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat); 396 397 /* Create the solve analysis information */ 398 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat); 399 400 /* set the operation */ 401 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 402 403 /* set the matrix */ 404 upTriFactor->csrMat = new CsrMatrix; 405 upTriFactor->csrMat->num_rows = n; 406 upTriFactor->csrMat->num_cols = n; 407 upTriFactor->csrMat->num_entries = nzUpper; 408 409 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 410 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 411 412 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 413 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 414 415 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 416 upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 417 418 /* perform the solve analysis */ 419 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 420 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 421 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 422 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat); 423 424 /* assign the pointer. Is this really necessary? */ 425 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 426 427 ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr); 428 ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr); 429 ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr); 430 } catch(char *ex) { 431 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 432 } 433 } 434 PetscFunctionReturn(0); 435 } 436 437 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 438 { 439 PetscErrorCode ierr; 440 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 441 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 442 IS isrow = a->row,iscol = a->icol; 443 PetscBool row_identity,col_identity; 444 const PetscInt *r,*c; 445 PetscInt n = A->rmap->n; 446 447 PetscFunctionBegin; 448 ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 449 ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 450 451 cusparseTriFactors->workVector = new THRUSTARRAY(n); 452 cusparseTriFactors->nnz=a->nz; 453 454 A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH; 455 /*lower triangular indices */ 456 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 457 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 458 if (!row_identity) { 459 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 460 cusparseTriFactors->rpermIndices->assign(r, r+n); 461 } 462 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 463 464 /*upper triangular indices */ 465 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 466 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 467 if (!col_identity) { 468 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 469 cusparseTriFactors->cpermIndices->assign(c, c+n); 470 } 471 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 472 PetscFunctionReturn(0); 473 } 474 475 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 476 { 477 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 478 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 479 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 480 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 481 cusparseStatus_t stat; 482 PetscErrorCode ierr; 483 PetscInt *AiUp, *AjUp; 484 PetscScalar *AAUp; 485 PetscScalar *AALo; 486 PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 487 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 488 const PetscInt *ai = b->i,*aj = b->j,*vj; 489 const MatScalar *aa = b->a,*v; 490 491 PetscFunctionBegin; 492 if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 493 try { 494 /* Allocate Space for the upper triangular matrix */ 495 ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr); 496 ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr); 497 ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr); 498 ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr); 499 500 /* Fill the upper triangular matrix */ 501 AiUp[0]=(PetscInt) 0; 502 AiUp[n]=nzUpper; 503 offset = 0; 504 for (i=0; i<n; i++) { 505 /* set the pointers */ 506 v = aa + ai[i]; 507 vj = aj + ai[i]; 508 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 509 510 /* first, set the diagonal elements */ 511 AjUp[offset] = (PetscInt) i; 512 AAUp[offset] = (MatScalar)1.0/v[nz]; 513 AiUp[i] = offset; 514 AALo[offset] = (MatScalar)1.0/v[nz]; 515 516 offset+=1; 517 if (nz>0) { 518 ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr); 519 ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 520 for (j=offset; j<offset+nz; j++) { 521 AAUp[j] = -AAUp[j]; 522 AALo[j] = AAUp[j]/v[nz]; 523 } 524 offset+=nz; 525 } 526 } 527 528 /* allocate space for the triangular factor information */ 529 upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 530 531 /* Create the matrix description */ 532 stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat); 533 stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 534 stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 535 stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat); 536 stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat); 537 538 /* Create the solve analysis information */ 539 stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat); 540 541 /* set the operation */ 542 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 543 544 /* set the matrix */ 545 upTriFactor->csrMat = new CsrMatrix; 546 upTriFactor->csrMat->num_rows = A->rmap->n; 547 upTriFactor->csrMat->num_cols = A->cmap->n; 548 upTriFactor->csrMat->num_entries = a->nz; 549 550 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 551 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 552 553 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 554 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 555 556 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 557 upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 558 559 /* perform the solve analysis */ 560 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 561 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 562 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 563 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat); 564 565 /* assign the pointer. Is this really necessary? */ 566 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 567 568 /* allocate space for the triangular factor information */ 569 loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 570 571 /* Create the matrix description */ 572 stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat); 573 stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 574 stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat); 575 stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat); 576 stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat); 577 578 /* Create the solve analysis information */ 579 stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat); 580 581 /* set the operation */ 582 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 583 584 /* set the matrix */ 585 loTriFactor->csrMat = new CsrMatrix; 586 loTriFactor->csrMat->num_rows = A->rmap->n; 587 loTriFactor->csrMat->num_cols = A->cmap->n; 588 loTriFactor->csrMat->num_entries = a->nz; 589 590 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 591 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 592 593 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 594 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 595 596 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 597 loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 598 599 /* perform the solve analysis */ 600 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 601 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 602 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 603 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat); 604 605 /* assign the pointer. Is this really necessary? */ 606 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 607 608 A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH; 609 ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr); 610 ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr); 611 ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr); 612 ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr); 613 } catch(char *ex) { 614 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 615 } 616 } 617 PetscFunctionReturn(0); 618 } 619 620 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 621 { 622 PetscErrorCode ierr; 623 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 624 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 625 IS ip = a->row; 626 const PetscInt *rip; 627 PetscBool perm_identity; 628 PetscInt n = A->rmap->n; 629 630 PetscFunctionBegin; 631 ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 632 cusparseTriFactors->workVector = new THRUSTARRAY(n); 633 cusparseTriFactors->nnz=(a->nz-n)*2 + n; 634 635 /*lower triangular indices */ 636 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 637 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 638 if (!perm_identity) { 639 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 640 cusparseTriFactors->rpermIndices->assign(rip, rip+n); 641 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 642 cusparseTriFactors->cpermIndices->assign(rip, rip+n); 643 } 644 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 645 PetscFunctionReturn(0); 646 } 647 648 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 649 { 650 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 651 IS isrow = b->row,iscol = b->col; 652 PetscBool row_identity,col_identity; 653 PetscErrorCode ierr; 654 655 PetscFunctionBegin; 656 ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 657 /* determine which version of MatSolve needs to be used. */ 658 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 659 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 660 if (row_identity && col_identity) { 661 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 662 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 663 } else { 664 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 665 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 666 } 667 668 /* get the triangular factors */ 669 ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 670 PetscFunctionReturn(0); 671 } 672 673 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 674 { 675 Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 676 IS ip = b->row; 677 PetscBool perm_identity; 678 PetscErrorCode ierr; 679 680 PetscFunctionBegin; 681 ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 682 683 /* determine which version of MatSolve needs to be used. */ 684 ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 685 if (perm_identity) { 686 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 687 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 688 } else { 689 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 690 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 691 } 692 693 /* get the triangular factors */ 694 ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 695 PetscFunctionReturn(0); 696 } 697 698 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 699 { 700 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 701 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 702 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 703 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 704 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 705 cusparseStatus_t stat; 706 cusparseIndexBase_t indexBase; 707 cusparseMatrixType_t matrixType; 708 cusparseFillMode_t fillMode; 709 cusparseDiagType_t diagType; 710 711 PetscFunctionBegin; 712 713 /*********************************************/ 714 /* Now the Transpose of the Lower Tri Factor */ 715 /*********************************************/ 716 717 /* allocate space for the transpose of the lower triangular factor */ 718 loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 719 720 /* set the matrix descriptors of the lower triangular factor */ 721 matrixType = cusparseGetMatType(loTriFactor->descr); 722 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 723 fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 724 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 725 diagType = cusparseGetMatDiagType(loTriFactor->descr); 726 727 /* Create the matrix description */ 728 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat); 729 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat); 730 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat); 731 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat); 732 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat); 733 734 /* Create the solve analysis information */ 735 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat); 736 737 /* set the operation */ 738 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 739 740 /* allocate GPU space for the CSC of the lower triangular factor*/ 741 loTriFactorT->csrMat = new CsrMatrix; 742 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 743 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 744 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 745 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 746 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 747 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 748 749 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 750 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 751 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 752 loTriFactor->csrMat->values->data().get(), 753 loTriFactor->csrMat->row_offsets->data().get(), 754 loTriFactor->csrMat->column_indices->data().get(), 755 loTriFactorT->csrMat->values->data().get(), 756 loTriFactorT->csrMat->column_indices->data().get(), 757 loTriFactorT->csrMat->row_offsets->data().get(), 758 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 759 760 /* perform the solve analysis on the transposed matrix */ 761 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 762 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 763 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 764 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 765 loTriFactorT->solveInfo);CHKERRCUDA(stat); 766 767 /* assign the pointer. Is this really necessary? */ 768 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 769 770 /*********************************************/ 771 /* Now the Transpose of the Upper Tri Factor */ 772 /*********************************************/ 773 774 /* allocate space for the transpose of the upper triangular factor */ 775 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 776 777 /* set the matrix descriptors of the upper triangular factor */ 778 matrixType = cusparseGetMatType(upTriFactor->descr); 779 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 780 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 781 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 782 diagType = cusparseGetMatDiagType(upTriFactor->descr); 783 784 /* Create the matrix description */ 785 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat); 786 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat); 787 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat); 788 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat); 789 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat); 790 791 /* Create the solve analysis information */ 792 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat); 793 794 /* set the operation */ 795 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 796 797 /* allocate GPU space for the CSC of the upper triangular factor*/ 798 upTriFactorT->csrMat = new CsrMatrix; 799 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 800 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 801 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 802 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 803 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 804 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 805 806 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 807 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 808 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 809 upTriFactor->csrMat->values->data().get(), 810 upTriFactor->csrMat->row_offsets->data().get(), 811 upTriFactor->csrMat->column_indices->data().get(), 812 upTriFactorT->csrMat->values->data().get(), 813 upTriFactorT->csrMat->column_indices->data().get(), 814 upTriFactorT->csrMat->row_offsets->data().get(), 815 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 816 817 /* perform the solve analysis on the transposed matrix */ 818 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 819 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 820 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 821 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 822 upTriFactorT->solveInfo);CHKERRCUDA(stat); 823 824 /* assign the pointer. Is this really necessary? */ 825 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 826 PetscFunctionReturn(0); 827 } 828 829 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 830 { 831 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 832 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 833 Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 834 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 835 cusparseStatus_t stat; 836 cusparseIndexBase_t indexBase; 837 cudaError_t err; 838 839 PetscFunctionBegin; 840 841 /* allocate space for the triangular factor information */ 842 matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 843 stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat); 844 indexBase = cusparseGetMatIndexBase(matstruct->descr); 845 stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat); 846 stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 847 848 /* set alpha and beta */ 849 err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err); 850 err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 851 err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err); 852 err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 853 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 854 855 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 856 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 857 CsrMatrix *matrixT= new CsrMatrix; 858 matrixT->num_rows = A->cmap->n; 859 matrixT->num_cols = A->rmap->n; 860 matrixT->num_entries = a->nz; 861 matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 862 matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 863 matrixT->values = new THRUSTARRAY(a->nz); 864 865 /* compute the transpose of the upper triangular factor, i.e. the CSC */ 866 indexBase = cusparseGetMatIndexBase(matstruct->descr); 867 stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows, 868 matrix->num_cols, matrix->num_entries, 869 matrix->values->data().get(), 870 matrix->row_offsets->data().get(), 871 matrix->column_indices->data().get(), 872 matrixT->values->data().get(), 873 matrixT->column_indices->data().get(), 874 matrixT->row_offsets->data().get(), 875 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 876 877 /* assign the pointer */ 878 matstructT->mat = matrixT; 879 880 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 881 #if CUDA_VERSION>=5000 882 /* First convert HYB to CSR */ 883 CsrMatrix *temp= new CsrMatrix; 884 temp->num_rows = A->rmap->n; 885 temp->num_cols = A->cmap->n; 886 temp->num_entries = a->nz; 887 temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 888 temp->column_indices = new THRUSTINTARRAY32(a->nz); 889 temp->values = new THRUSTARRAY(a->nz); 890 891 892 stat = cusparse_hyb2csr(cusparsestruct->handle, 893 matstruct->descr, (cusparseHybMat_t)matstruct->mat, 894 temp->values->data().get(), 895 temp->row_offsets->data().get(), 896 temp->column_indices->data().get());CHKERRCUDA(stat); 897 898 /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 899 CsrMatrix *tempT= new CsrMatrix; 900 tempT->num_rows = A->rmap->n; 901 tempT->num_cols = A->cmap->n; 902 tempT->num_entries = a->nz; 903 tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 904 tempT->column_indices = new THRUSTINTARRAY32(a->nz); 905 tempT->values = new THRUSTARRAY(a->nz); 906 907 stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 908 temp->num_cols, temp->num_entries, 909 temp->values->data().get(), 910 temp->row_offsets->data().get(), 911 temp->column_indices->data().get(), 912 tempT->values->data().get(), 913 tempT->column_indices->data().get(), 914 tempT->row_offsets->data().get(), 915 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 916 917 /* Last, convert CSC to HYB */ 918 cusparseHybMat_t hybMat; 919 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 920 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 921 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 922 stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 923 matstructT->descr, tempT->values->data().get(), 924 tempT->row_offsets->data().get(), 925 tempT->column_indices->data().get(), 926 hybMat, 0, partition);CHKERRCUDA(stat); 927 928 /* assign the pointer */ 929 matstructT->mat = hybMat; 930 931 /* delete temporaries */ 932 if (tempT) { 933 if (tempT->values) delete (THRUSTARRAY*) tempT->values; 934 if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 935 if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 936 delete (CsrMatrix*) tempT; 937 } 938 if (temp) { 939 if (temp->values) delete (THRUSTARRAY*) temp->values; 940 if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 941 if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 942 delete (CsrMatrix*) temp; 943 } 944 #else 945 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later."); 946 #endif 947 } 948 /* assign the compressed row indices */ 949 matstructT->cprowIndices = new THRUSTINTARRAY; 950 matstructT->cprowIndices->resize(A->cmap->n); 951 thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end()); 952 953 /* assign the pointer */ 954 ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 955 PetscFunctionReturn(0); 956 } 957 958 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 959 { 960 PetscInt n = xx->map->n; 961 const PetscScalar *barray; 962 PetscScalar *xarray; 963 thrust::device_ptr<const PetscScalar> bGPU; 964 thrust::device_ptr<PetscScalar> xGPU; 965 cusparseStatus_t stat; 966 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 967 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 968 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 969 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 970 PetscErrorCode ierr; 971 972 PetscFunctionBegin; 973 /* Analyze the matrix and create the transpose ... on the fly */ 974 if (!loTriFactorT && !upTriFactorT) { 975 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 976 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 977 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 978 } 979 980 /* Get the GPU pointers */ 981 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 982 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 983 xGPU = thrust::device_pointer_cast(xarray); 984 bGPU = thrust::device_pointer_cast(barray); 985 986 /* First, reorder with the row permutation */ 987 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 988 thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 989 xGPU); 990 991 /* First, solve U */ 992 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 993 upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 994 upTriFactorT->csrMat->values->data().get(), 995 upTriFactorT->csrMat->row_offsets->data().get(), 996 upTriFactorT->csrMat->column_indices->data().get(), 997 upTriFactorT->solveInfo, 998 xarray, tempGPU->data().get());CHKERRCUDA(stat); 999 1000 /* Then, solve L */ 1001 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1002 loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1003 loTriFactorT->csrMat->values->data().get(), 1004 loTriFactorT->csrMat->row_offsets->data().get(), 1005 loTriFactorT->csrMat->column_indices->data().get(), 1006 loTriFactorT->solveInfo, 1007 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1008 1009 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1010 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1011 thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1012 tempGPU->begin()); 1013 1014 /* Copy the temporary to the full solution. */ 1015 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1016 1017 /* restore */ 1018 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1019 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1020 ierr = WaitForGPU();CHKERRCUDA(ierr); 1021 1022 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1023 PetscFunctionReturn(0); 1024 } 1025 1026 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1027 { 1028 const PetscScalar *barray; 1029 PetscScalar *xarray; 1030 cusparseStatus_t stat; 1031 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1032 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1033 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1034 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1035 PetscErrorCode ierr; 1036 1037 PetscFunctionBegin; 1038 /* Analyze the matrix and create the transpose ... on the fly */ 1039 if (!loTriFactorT && !upTriFactorT) { 1040 ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1041 loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1042 upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1043 } 1044 1045 /* Get the GPU pointers */ 1046 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1047 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1048 1049 /* First, solve U */ 1050 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1051 upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr, 1052 upTriFactorT->csrMat->values->data().get(), 1053 upTriFactorT->csrMat->row_offsets->data().get(), 1054 upTriFactorT->csrMat->column_indices->data().get(), 1055 upTriFactorT->solveInfo, 1056 barray, tempGPU->data().get());CHKERRCUDA(stat); 1057 1058 /* Then, solve L */ 1059 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1060 loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr, 1061 loTriFactorT->csrMat->values->data().get(), 1062 loTriFactorT->csrMat->row_offsets->data().get(), 1063 loTriFactorT->csrMat->column_indices->data().get(), 1064 loTriFactorT->solveInfo, 1065 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1066 1067 /* restore */ 1068 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1069 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1070 ierr = WaitForGPU();CHKERRCUDA(ierr); 1071 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1072 PetscFunctionReturn(0); 1073 } 1074 1075 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1076 { 1077 const PetscScalar *barray; 1078 PetscScalar *xarray; 1079 thrust::device_ptr<const PetscScalar> bGPU; 1080 thrust::device_ptr<PetscScalar> xGPU; 1081 cusparseStatus_t stat; 1082 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1083 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1084 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1085 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1086 PetscErrorCode ierr; 1087 1088 PetscFunctionBegin; 1089 1090 /* Get the GPU pointers */ 1091 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1092 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1093 xGPU = thrust::device_pointer_cast(xarray); 1094 bGPU = thrust::device_pointer_cast(barray); 1095 1096 /* First, reorder with the row permutation */ 1097 thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1098 thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 1099 xGPU); 1100 1101 /* Next, solve L */ 1102 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1103 loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1104 loTriFactor->csrMat->values->data().get(), 1105 loTriFactor->csrMat->row_offsets->data().get(), 1106 loTriFactor->csrMat->column_indices->data().get(), 1107 loTriFactor->solveInfo, 1108 xarray, tempGPU->data().get());CHKERRCUDA(stat); 1109 1110 /* Then, solve U */ 1111 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1112 upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1113 upTriFactor->csrMat->values->data().get(), 1114 upTriFactor->csrMat->row_offsets->data().get(), 1115 upTriFactor->csrMat->column_indices->data().get(), 1116 upTriFactor->solveInfo, 1117 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1118 1119 /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1120 thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1121 thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()), 1122 tempGPU->begin()); 1123 1124 /* Copy the temporary to the full solution. */ 1125 thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1126 1127 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1128 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1129 ierr = WaitForGPU();CHKERRCUDA(ierr); 1130 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1131 PetscFunctionReturn(0); 1132 } 1133 1134 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1135 { 1136 const PetscScalar *barray; 1137 PetscScalar *xarray; 1138 cusparseStatus_t stat; 1139 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1140 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1141 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1142 THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1143 PetscErrorCode ierr; 1144 1145 PetscFunctionBegin; 1146 /* Get the GPU pointers */ 1147 ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1148 ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1149 1150 /* First, solve L */ 1151 stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1152 loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr, 1153 loTriFactor->csrMat->values->data().get(), 1154 loTriFactor->csrMat->row_offsets->data().get(), 1155 loTriFactor->csrMat->column_indices->data().get(), 1156 loTriFactor->solveInfo, 1157 barray, tempGPU->data().get());CHKERRCUDA(stat); 1158 1159 /* Next, solve U */ 1160 stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1161 upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr, 1162 upTriFactor->csrMat->values->data().get(), 1163 upTriFactor->csrMat->row_offsets->data().get(), 1164 upTriFactor->csrMat->column_indices->data().get(), 1165 upTriFactor->solveInfo, 1166 tempGPU->data().get(), xarray);CHKERRCUDA(stat); 1167 1168 ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1169 ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 1170 ierr = WaitForGPU();CHKERRCUDA(ierr); 1171 ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1172 PetscFunctionReturn(0); 1173 } 1174 1175 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 1176 { 1177 1178 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1179 Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1180 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1181 PetscInt m = A->rmap->n,*ii,*ridx; 1182 PetscErrorCode ierr; 1183 cusparseStatus_t stat; 1184 cudaError_t err; 1185 1186 PetscFunctionBegin; 1187 if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) { 1188 ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1189 if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 1190 CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1191 /* copy values only */ 1192 matrix->values->assign(a->a, a->a+a->nz); 1193 } else { 1194 MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format); 1195 try { 1196 cusparsestruct->nonzerorow=0; 1197 for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0); 1198 1199 if (a->compressedrow.use) { 1200 m = a->compressedrow.nrows; 1201 ii = a->compressedrow.i; 1202 ridx = a->compressedrow.rindex; 1203 } else { 1204 /* Forcing compressed row on the GPU */ 1205 int k=0; 1206 ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr); 1207 ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr); 1208 ii[0]=0; 1209 for (int j = 0; j<m; j++) { 1210 if ((a->i[j+1]-a->i[j])>0) { 1211 ii[k] = a->i[j]; 1212 ridx[k]= j; 1213 k++; 1214 } 1215 } 1216 ii[cusparsestruct->nonzerorow] = a->nz; 1217 m = cusparsestruct->nonzerorow; 1218 } 1219 1220 /* allocate space for the triangular factor information */ 1221 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 1222 stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat); 1223 stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat); 1224 stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat); 1225 1226 err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err); 1227 err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1228 err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err); 1229 err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 1230 stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat); 1231 1232 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1233 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1234 /* set the matrix */ 1235 CsrMatrix *matrix= new CsrMatrix; 1236 matrix->num_rows = m; 1237 matrix->num_cols = A->cmap->n; 1238 matrix->num_entries = a->nz; 1239 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1240 matrix->row_offsets->assign(ii, ii + m+1); 1241 1242 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1243 matrix->column_indices->assign(a->j, a->j+a->nz); 1244 1245 matrix->values = new THRUSTARRAY(a->nz); 1246 matrix->values->assign(a->a, a->a+a->nz); 1247 1248 /* assign the pointer */ 1249 matstruct->mat = matrix; 1250 1251 } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1252 #if CUDA_VERSION>=4020 1253 CsrMatrix *matrix= new CsrMatrix; 1254 matrix->num_rows = m; 1255 matrix->num_cols = A->cmap->n; 1256 matrix->num_entries = a->nz; 1257 matrix->row_offsets = new THRUSTINTARRAY32(m+1); 1258 matrix->row_offsets->assign(ii, ii + m+1); 1259 1260 matrix->column_indices = new THRUSTINTARRAY32(a->nz); 1261 matrix->column_indices->assign(a->j, a->j+a->nz); 1262 1263 matrix->values = new THRUSTARRAY(a->nz); 1264 matrix->values->assign(a->a, a->a+a->nz); 1265 1266 cusparseHybMat_t hybMat; 1267 stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat); 1268 cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1269 CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1270 stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols, 1271 matstruct->descr, matrix->values->data().get(), 1272 matrix->row_offsets->data().get(), 1273 matrix->column_indices->data().get(), 1274 hybMat, 0, partition);CHKERRCUDA(stat); 1275 /* assign the pointer */ 1276 matstruct->mat = hybMat; 1277 1278 if (matrix) { 1279 if (matrix->values) delete (THRUSTARRAY*)matrix->values; 1280 if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices; 1281 if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets; 1282 delete (CsrMatrix*)matrix; 1283 } 1284 #endif 1285 } 1286 1287 /* assign the compressed row indices */ 1288 matstruct->cprowIndices = new THRUSTINTARRAY(m); 1289 matstruct->cprowIndices->assign(ridx,ridx+m); 1290 1291 /* assign the pointer */ 1292 cusparsestruct->mat = matstruct; 1293 1294 if (!a->compressedrow.use) { 1295 ierr = PetscFree(ii);CHKERRQ(ierr); 1296 ierr = PetscFree(ridx);CHKERRQ(ierr); 1297 } 1298 cusparsestruct->workVector = new THRUSTARRAY(m); 1299 } catch(char *ex) { 1300 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1301 } 1302 cusparsestruct->nonzerostate = A->nonzerostate; 1303 } 1304 ierr = WaitForGPU();CHKERRCUDA(ierr); 1305 A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH; 1306 ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1307 } 1308 PetscFunctionReturn(0); 1309 } 1310 1311 struct VecCUDAPlusEquals 1312 { 1313 template <typename Tuple> 1314 __host__ __device__ 1315 void operator()(Tuple t) 1316 { 1317 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1318 } 1319 }; 1320 1321 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1322 { 1323 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1324 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1325 Mat_SeqAIJCUSPARSEMultStruct *matstruct; /* Do not initialize yet, because GPU data may not be available yet */ 1326 const PetscScalar *xarray; 1327 PetscScalar *yarray; 1328 PetscErrorCode ierr; 1329 cusparseStatus_t stat; 1330 1331 PetscFunctionBegin; 1332 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1333 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1334 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1335 ierr = VecSet(yy,0);CHKERRQ(ierr); 1336 ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1337 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1338 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1339 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1340 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1341 mat->num_rows, mat->num_cols, mat->num_entries, 1342 matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), 1343 mat->column_indices->data().get(), xarray, matstruct->beta, 1344 yarray);CHKERRCUDA(stat); 1345 } else { 1346 #if CUDA_VERSION>=4020 1347 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1348 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1349 matstruct->alpha, matstruct->descr, hybMat, 1350 xarray, matstruct->beta, 1351 yarray);CHKERRCUDA(stat); 1352 #endif 1353 } 1354 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1355 ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1356 if (!cusparsestruct->stream) { 1357 ierr = WaitForGPU();CHKERRCUDA(ierr); 1358 } 1359 ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1360 PetscFunctionReturn(0); 1361 } 1362 1363 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1364 { 1365 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1366 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1367 Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1368 const PetscScalar *xarray; 1369 PetscScalar *yarray; 1370 PetscErrorCode ierr; 1371 cusparseStatus_t stat; 1372 1373 PetscFunctionBegin; 1374 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1375 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1376 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1377 if (!matstructT) { 1378 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1379 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1380 } 1381 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1382 ierr = VecSet(yy,0);CHKERRQ(ierr); 1383 ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1384 1385 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1386 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1387 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1388 mat->num_rows, mat->num_cols, 1389 mat->num_entries, matstructT->alpha, matstructT->descr, 1390 mat->values->data().get(), mat->row_offsets->data().get(), 1391 mat->column_indices->data().get(), xarray, matstructT->beta, 1392 yarray);CHKERRCUDA(stat); 1393 } else { 1394 #if CUDA_VERSION>=4020 1395 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1396 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1397 matstructT->alpha, matstructT->descr, hybMat, 1398 xarray, matstructT->beta, 1399 yarray);CHKERRCUDA(stat); 1400 #endif 1401 } 1402 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1403 ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1404 if (!cusparsestruct->stream) { 1405 ierr = WaitForGPU();CHKERRCUDA(ierr); 1406 } 1407 ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1408 PetscFunctionReturn(0); 1409 } 1410 1411 1412 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1413 { 1414 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1415 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1416 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 1417 thrust::device_ptr<PetscScalar> zptr; 1418 const PetscScalar *xarray; 1419 PetscScalar *zarray; 1420 PetscErrorCode ierr; 1421 cusparseStatus_t stat; 1422 1423 PetscFunctionBegin; 1424 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1425 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1426 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1427 try { 1428 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1429 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1430 ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr); 1431 zptr = thrust::device_pointer_cast(zarray); 1432 1433 /* multiply add */ 1434 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1435 CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 1436 /* here we need to be careful to set the number of rows in the multiply to the 1437 number of compressed rows in the matrix ... which is equivalent to the 1438 size of the workVector */ 1439 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1440 mat->num_rows, mat->num_cols, 1441 mat->num_entries, matstruct->alpha, matstruct->descr, 1442 mat->values->data().get(), mat->row_offsets->data().get(), 1443 mat->column_indices->data().get(), xarray, matstruct->beta, 1444 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1445 } else { 1446 #if CUDA_VERSION>=4020 1447 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 1448 if (cusparsestruct->workVector->size()) { 1449 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1450 matstruct->alpha, matstruct->descr, hybMat, 1451 xarray, matstruct->beta, 1452 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1453 } 1454 #endif 1455 } 1456 1457 /* scatter the data from the temporary into the full vector with a += operation */ 1458 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 1459 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(), 1460 VecCUDAPlusEquals()); 1461 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1462 ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr); 1463 1464 } catch(char *ex) { 1465 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1466 } 1467 ierr = WaitForGPU();CHKERRCUDA(ierr); 1468 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1469 PetscFunctionReturn(0); 1470 } 1471 1472 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1473 { 1474 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1475 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1476 Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1477 thrust::device_ptr<PetscScalar> zptr; 1478 const PetscScalar *xarray; 1479 PetscScalar *zarray; 1480 PetscErrorCode ierr; 1481 cusparseStatus_t stat; 1482 1483 PetscFunctionBegin; 1484 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1485 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1486 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1487 if (!matstructT) { 1488 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1489 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1490 } 1491 1492 try { 1493 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1494 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1495 ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr); 1496 zptr = thrust::device_pointer_cast(zarray); 1497 1498 /* multiply add with matrix transpose */ 1499 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1500 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1501 /* here we need to be careful to set the number of rows in the multiply to the 1502 number of compressed rows in the matrix ... which is equivalent to the 1503 size of the workVector */ 1504 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1505 mat->num_rows, mat->num_cols, 1506 mat->num_entries, matstructT->alpha, matstructT->descr, 1507 mat->values->data().get(), mat->row_offsets->data().get(), 1508 mat->column_indices->data().get(), xarray, matstructT->beta, 1509 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1510 } else { 1511 #if CUDA_VERSION>=4020 1512 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1513 if (cusparsestruct->workVector->size()) { 1514 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1515 matstructT->alpha, matstructT->descr, hybMat, 1516 xarray, matstructT->beta, 1517 cusparsestruct->workVector->data().get());CHKERRCUDA(stat); 1518 } 1519 #endif 1520 } 1521 1522 /* scatter the data from the temporary into the full vector with a += operation */ 1523 thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))), 1524 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n, 1525 VecCUDAPlusEquals()); 1526 1527 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1528 ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr); 1529 1530 } catch(char *ex) { 1531 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1532 } 1533 ierr = WaitForGPU();CHKERRCUDA(ierr); 1534 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 1535 PetscFunctionReturn(0); 1536 } 1537 1538 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 1539 { 1540 PetscErrorCode ierr; 1541 1542 PetscFunctionBegin; 1543 ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 1544 if (A->factortype==MAT_FACTOR_NONE) { 1545 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1546 } 1547 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 1548 A->ops->mult = MatMult_SeqAIJCUSPARSE; 1549 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1550 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1551 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1552 PetscFunctionReturn(0); 1553 } 1554 1555 /* --------------------------------------------------------------------------------*/ 1556 /*@ 1557 MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1558 (the default parallel PETSc format). This matrix will ultimately pushed down 1559 to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1560 assembly performance the user should preallocate the matrix storage by setting 1561 the parameter nz (or the array nnz). By setting these parameters accurately, 1562 performance during matrix assembly can be increased by more than a factor of 50. 1563 1564 Collective on MPI_Comm 1565 1566 Input Parameters: 1567 + comm - MPI communicator, set to PETSC_COMM_SELF 1568 . m - number of rows 1569 . n - number of columns 1570 . nz - number of nonzeros per row (same for all rows) 1571 - nnz - array containing the number of nonzeros in the various rows 1572 (possibly different for each row) or NULL 1573 1574 Output Parameter: 1575 . A - the matrix 1576 1577 It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1578 MatXXXXSetPreallocation() paradgm instead of this routine directly. 1579 [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1580 1581 Notes: 1582 If nnz is given then nz is ignored 1583 1584 The AIJ format (also called the Yale sparse matrix format or 1585 compressed row storage), is fully compatible with standard Fortran 77 1586 storage. That is, the stored row and column indices can begin at 1587 either one (as in Fortran) or zero. See the users' manual for details. 1588 1589 Specify the preallocated storage with either nz or nnz (not both). 1590 Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1591 allocation. For large problems you MUST preallocate memory or you 1592 will get TERRIBLE performance, see the users' manual chapter on matrices. 1593 1594 By default, this format uses inodes (identical nodes) when possible, to 1595 improve numerical efficiency of matrix-vector products and solves. We 1596 search for consecutive rows with the same nonzero structure, thereby 1597 reusing matrix information to achieve increased efficiency. 1598 1599 Level: intermediate 1600 1601 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 1602 @*/ 1603 PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 1604 { 1605 PetscErrorCode ierr; 1606 1607 PetscFunctionBegin; 1608 ierr = MatCreate(comm,A);CHKERRQ(ierr); 1609 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 1610 ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1611 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 1612 PetscFunctionReturn(0); 1613 } 1614 1615 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 1616 { 1617 PetscErrorCode ierr; 1618 1619 PetscFunctionBegin; 1620 if (A->factortype==MAT_FACTOR_NONE) { 1621 if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) { 1622 ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 1623 } 1624 } else { 1625 ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 1626 } 1627 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 1628 PetscFunctionReturn(0); 1629 } 1630 1631 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 1632 { 1633 PetscErrorCode ierr; 1634 Mat C; 1635 cusparseStatus_t stat; 1636 cusparseHandle_t handle=0; 1637 1638 PetscFunctionBegin; 1639 ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 1640 C = *B; 1641 ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr); 1642 ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr); 1643 1644 /* inject CUSPARSE-specific stuff */ 1645 if (C->factortype==MAT_FACTOR_NONE) { 1646 /* you cannot check the inode.use flag here since the matrix was just created. 1647 now build a GPU matrix data structure */ 1648 C->spptr = new Mat_SeqAIJCUSPARSE; 1649 ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat = 0; 1650 ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0; 1651 ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector = 0; 1652 ((Mat_SeqAIJCUSPARSE*)C->spptr)->format = MAT_CUSPARSE_CSR; 1653 ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0; 1654 ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = 0; 1655 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1656 ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle = handle; 1657 ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream = 0; 1658 ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0; 1659 } else { 1660 /* NEXT, set the pointers to the triangular factors */ 1661 C->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1662 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr = 0; 1663 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr = 0; 1664 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0; 1665 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0; 1666 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices = 0; 1667 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices = 0; 1668 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector = 0; 1669 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = 0; 1670 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1671 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle = handle; 1672 ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz = 0; 1673 } 1674 1675 C->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1676 C->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1677 C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1678 C->ops->mult = MatMult_SeqAIJCUSPARSE; 1679 C->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1680 C->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1681 C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1682 C->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1683 1684 ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1685 1686 C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1687 1688 ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1689 PetscFunctionReturn(0); 1690 } 1691 1692 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1693 { 1694 PetscErrorCode ierr; 1695 cusparseStatus_t stat; 1696 cusparseHandle_t handle=0; 1697 1698 PetscFunctionBegin; 1699 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1700 ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 1701 ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 1702 1703 if (B->factortype==MAT_FACTOR_NONE) { 1704 /* you cannot check the inode.use flag here since the matrix was just created. 1705 now build a GPU matrix data structure */ 1706 B->spptr = new Mat_SeqAIJCUSPARSE; 1707 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1708 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1709 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1710 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1711 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1712 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = 0; 1713 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1714 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1715 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1716 ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0; 1717 } else { 1718 /* NEXT, set the pointers to the triangular factors */ 1719 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1720 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1721 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1722 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1723 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1724 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1725 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1726 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1727 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = 0; 1728 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1729 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1730 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1731 } 1732 1733 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1734 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1735 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1736 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1737 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1738 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1739 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1740 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1741 1742 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1743 1744 B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1745 1746 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1747 PetscFunctionReturn(0); 1748 } 1749 1750 /*MC 1751 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1752 1753 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1754 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1755 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1756 1757 Options Database Keys: 1758 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1759 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1760 . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1761 1762 Level: beginner 1763 1764 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1765 M*/ 1766 1767 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1768 1769 1770 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 1771 { 1772 PetscErrorCode ierr; 1773 1774 PetscFunctionBegin; 1775 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1776 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1777 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1778 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1779 PetscFunctionReturn(0); 1780 } 1781 1782 1783 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1784 { 1785 cusparseStatus_t stat; 1786 cusparseHandle_t handle; 1787 1788 PetscFunctionBegin; 1789 if (*cusparsestruct) { 1790 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1791 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1792 delete (*cusparsestruct)->workVector; 1793 if (handle = (*cusparsestruct)->handle) { 1794 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1795 } 1796 delete *cusparsestruct; 1797 *cusparsestruct = 0; 1798 } 1799 PetscFunctionReturn(0); 1800 } 1801 1802 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1803 { 1804 PetscFunctionBegin; 1805 if (*mat) { 1806 delete (*mat)->values; 1807 delete (*mat)->column_indices; 1808 delete (*mat)->row_offsets; 1809 delete *mat; 1810 *mat = 0; 1811 } 1812 PetscFunctionReturn(0); 1813 } 1814 1815 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 1816 { 1817 cusparseStatus_t stat; 1818 PetscErrorCode ierr; 1819 1820 PetscFunctionBegin; 1821 if (*trifactor) { 1822 if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); } 1823 if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); } 1824 ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 1825 delete *trifactor; 1826 *trifactor = 0; 1827 } 1828 PetscFunctionReturn(0); 1829 } 1830 1831 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 1832 { 1833 CsrMatrix *mat; 1834 cusparseStatus_t stat; 1835 cudaError_t err; 1836 1837 PetscFunctionBegin; 1838 if (*matstruct) { 1839 if ((*matstruct)->mat) { 1840 if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 1841 cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 1842 stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat); 1843 } else { 1844 mat = (CsrMatrix*)(*matstruct)->mat; 1845 CsrMatrix_Destroy(&mat); 1846 } 1847 } 1848 if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); } 1849 delete (*matstruct)->cprowIndices; 1850 if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); } 1851 if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); } 1852 delete *matstruct; 1853 *matstruct = 0; 1854 } 1855 PetscFunctionReturn(0); 1856 } 1857 1858 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 1859 { 1860 cusparseHandle_t handle; 1861 cusparseStatus_t stat; 1862 1863 PetscFunctionBegin; 1864 if (*trifactors) { 1865 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr); 1866 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr); 1867 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose); 1868 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose); 1869 delete (*trifactors)->rpermIndices; 1870 delete (*trifactors)->cpermIndices; 1871 delete (*trifactors)->workVector; 1872 if (handle = (*trifactors)->handle) { 1873 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1874 } 1875 delete *trifactors; 1876 *trifactors = 0; 1877 } 1878 PetscFunctionReturn(0); 1879 } 1880 1881