/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* Option-database names for the MatCUSPARSEStorageFormat enum (terminated per PetscOptionsEnum convention) */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations of the factorization, solve, and multiply implementations defined later in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
/* NOTE(review): the declaration completed on the following line takes a
   Mat_SeqAIJCUSPARSETriFactorStruct** yet carries the name
   MatSeqAIJCUSPARSEMultStruct_Destroy, colliding with the MultStruct declaration
   just below; it most likely should be MatSeqAIJCUSPARSETriFactorStruct_Destroy --
   verify against the definitions later in this file. */
static PetscErrorCode
MatSeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); /* fixed: was misnamed MatSeqAIJCUSPARSEMultStruct_Destroy, colliding with the declaration below */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Attach a caller-owned CUDA stream to the cuSPARSE handle stored in A's GPU data.
   The stream is recorded in the Mat_SeqAIJCUSPARSE struct and passed to cuSPARSE. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Install a new cuSPARSE handle into A's GPU data, destroying any previously
   stored handle, and switch the handle to device pointer mode. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the handle we previously owned before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Drop (without destroying) the cuSPARSE handle stored in A's GPU data;
   ownership stays with whoever supplied the handle via MatCUSPARSESetHandle(). */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE ("cusparse") */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k).
Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory routine registered for MATSEQAIJCUSPARSE with MATSOLVERCUSPARSE:
   creates the (square, n x n) factor matrix B and installs the symbolic
   factorization function pointers appropriate for the requested factor type.
   Unsupported factor types raise PETSC_ERR_SUP. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* factor matrices manage their own storage; skip the usual AIJ preallocation */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE matrices */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation
op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  /* sequential matrices keep a single storage format, so MULT and ALL are equivalent here */
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* before CUDA 4.2 only the CSR storage format is available */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
The latter two require CUDA 4.2)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Process the -mat_cusparse_* options: the SpMV-only format (unfactored matrices
   only) and the combined SpMV+TriSolve format */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  /* NOTE(review): for factored matrices A->spptr holds Mat_SeqAIJCUSPARSETriFactors,
     so reading cusparsestruct->format here relies on the cast above being benign --
     verify for the factored path */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}

/* Symbolic ILU: defer to the CPU AIJ implementation, then install the
   CUSPARSE-aware numeric stage */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU: defer to the CPU AIJ implementation, then install the
   CUSPARSE-aware numeric stage */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC: defer to the CPU AIJ implementation, then install the
   CUSPARSE-aware numeric stage */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky: defer to the CPU AIJ implementation, then install the
   CUSPARSE-aware numeric stage */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Build the unit-diagonal lower triangular ILU factor L (0-based CSR) on the GPU
   from the host factorization stored in A, then run the cuSPARSE solve analysis.
   Only does work when the GPU copy is absent or stale. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure
out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix (pinned host staging buffers) */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix: row 0 holds only the unit diagonal;
         each later row copies its strictly-lower entries from the host factor
         and appends a 1.0 diagonal entry */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* thrust::assign copies the staged host arrays to the device vectors */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary?
*/
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned host staging buffers */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build the non-unit-diagonal upper triangular ILU factor U (0-based CSR) on the
   GPU from the host factorization stored in A, then run the cuSPARSE solve
   analysis. Only does work when the GPU copy is absent or stale. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows last-to-first; a->diag
         appears to index each row's diagonal entry in the (reverse-stored) upper
         part of the host factor -- row i occupies aa[adiag[i+1]+1 .. adiag[i]] */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements; the diagonal is stored as its reciprocal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* thrust::assign copies the staged host arrays to the device vectors */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned host staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Copy both ILU triangular factors to the GPU, allocate the solve work vector,
   and upload the row/column permutations (only when not the identity). */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector =
new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log the host-to-device traffic for whichever permutations were uploaded */
  if(!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build both ICC triangular solve matrices on the GPU from the host factor.
   NOTE(review): A->data is viewed both as Mat_SeqAIJ (for a->nz) and as
   Mat_SeqSBAIJ (for the i/j/a arrays) -- the ICC factor is presumably stored
   in SBAIJ layout; verify. Only does work when the GPU copy is absent or stale. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED ||
A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging buffers;
         AALo holds the values for the transpose-based lower solve built later) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements (stored as the reciprocal of the factor diagonal) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* off-diagonals are negated for U; the L values are additionally
             scaled by the reciprocal diagonal */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      /* thrust::assign copies the staged host arrays to the device vectors */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary?
*/
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description.
         NOTE(review): the "lower" solve reuses the UPPER-stored structure with a
         TRANSPOSE solve op below, hence FILL_MODE_UPPER here -- presumably
         intentional; verify against the solve routines. */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same sparsity pattern as U, but the AALo values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      /* release the pinned host staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Copy the ICC triangular factors to the GPU, allocate the solve work vector,
   and upload the (symmetric) permutation when it is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* full symmetric nonzero count reconstructed from the stored upper triangle */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* the same permutation is used on both sides for ICC */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric LU: run the CPU numeric factorization, choose the solve variant based
   on whether the orderings are natural, then move the factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used: the NaturalOrdering
     variants skip the permutation step */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky: run the CPU numeric factorization, choose the solve variant
   based on whether the ordering is natural, then move the factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used.
*/
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build transposed (CSC) copies of both triangular factors and run the cuSPARSE
   solve analysis on them; presumably consumed by the MatSolveTranspose paths so
   they can use NON_TRANSPOSE solves -- verify against those routines. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor,
     flipping the fill mode since the transpose swaps upper/lower */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
738 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 739 diagType = cusparseGetMatDiagType(loTriFactor->descr); 740 741 /* Create the matrix description */ 742 stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat); 743 stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat); 744 stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat); 745 stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat); 746 stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat); 747 748 /* Create the solve analysis information */ 749 stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat); 750 751 /* set the operation */ 752 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 753 754 /* allocate GPU space for the CSC of the lower triangular factor*/ 755 loTriFactorT->csrMat = new CsrMatrix; 756 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows; 757 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols; 758 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 759 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1); 760 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries); 761 loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries); 762 763 /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 764 stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 765 loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 766 loTriFactor->csrMat->values->data().get(), 767 loTriFactor->csrMat->row_offsets->data().get(), 768 loTriFactor->csrMat->column_indices->data().get(), 769 loTriFactorT->csrMat->values->data().get(), 770 loTriFactorT->csrMat->column_indices->data().get(), 771 loTriFactorT->csrMat->row_offsets->data().get(), 772 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 773 774 /* perform the solve analysis on the transposed matrix */ 775 stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 776 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, 777 loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 778 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 779 loTriFactorT->solveInfo);CHKERRCUDA(stat); 780 781 /* assign the pointer. Is this really necessary? */ 782 ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 783 784 /*********************************************/ 785 /* Now the Transpose of the Upper Tri Factor */ 786 /*********************************************/ 787 788 /* allocate space for the transpose of the upper triangular factor */ 789 upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 790 791 /* set the matrix descriptors of the upper triangular factor */ 792 matrixType = cusparseGetMatType(upTriFactor->descr); 793 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 794 fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
795 CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 796 diagType = cusparseGetMatDiagType(upTriFactor->descr); 797 798 /* Create the matrix description */ 799 stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat); 800 stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat); 801 stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat); 802 stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat); 803 stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat); 804 805 /* Create the solve analysis information */ 806 stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat); 807 808 /* set the operation */ 809 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 810 811 /* allocate GPU space for the CSC of the upper triangular factor*/ 812 upTriFactorT->csrMat = new CsrMatrix; 813 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows; 814 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols; 815 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 816 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1); 817 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries); 818 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries); 819 820 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 821 stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 822 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 823 upTriFactor->csrMat->values->data().get(), 824 upTriFactor->csrMat->row_offsets->data().get(), 825 upTriFactor->csrMat->column_indices->data().get(), 826 upTriFactorT->csrMat->values->data().get(), 827 upTriFactorT->csrMat->column_indices->data().get(), 828 upTriFactorT->csrMat->row_offsets->data().get(), 829 CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat); 830 831 /* perform the solve analysis on the transposed matrix */ 832 stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 833 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, 834 upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 835 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 836 upTriFactorT->solveInfo);CHKERRCUDA(stat); 837 838 /* assign the pointer. Is this really necessary? 
/*
   Builds the GPU representation of A^T (in the storage format selected for A)
   so that MatMultTranspose/MatMultTransposeAdd can run a NON_TRANSPOSE SpMV
   on the pre-transposed data.  Called lazily on the first transpose multiply.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT; /* was dead-initialized from the struct and immediately overwritten by 'new' below */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the transpose-multiply information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta as device-resident scalars (pointer mode DEVICE below) */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = new CsrMatrix;
    matrixT->num_rows    = A->cmap->n;  /* the transpose is num_cols x num_rows of A */
    matrixT->num_cols    = A->rmap->n;
    matrixT->num_entries = a->nz;
    /* fix: the transpose has A->cmap->n rows, so it needs cmap->n+1 offsets;
       the previous rmap->n+1 under-allocates for rectangular matrices with more columns than rows */
    matrixT->row_offsets    = new THRUSTINTARRAY32(A->cmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values         = new THRUSTARRAY(a->nz);

    /* compute the transpose of the matrix, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->cmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp = new CsrMatrix;
    temp->num_rows       = A->rmap->n;
    temp->num_cols       = A->cmap->n;
    temp->num_entries    = a->nz;
    temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT = new CsrMatrix;
    /* fix: the transpose is cmap->n x rmap->n (the previous rmap x cmap sizing
       under-allocates row_offsets for rectangular matrices) */
    tempT->num_rows       = A->cmap->n;
    tempT->num_cols       = A->rmap->n;
    tempT->num_entries    = a->nz;
    tempT->row_offsets    = new THRUSTINTARRAY32(A->cmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values         = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    /* fix: pass the transpose's dimensions (cmap x rmap), matching tempT */
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values)         delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets)    delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values)          delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices)  delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets)     delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices: every row of A^T (i.e. every column of A)
     is treated as a nonzero row */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
/*
   Solves A^T x = b using the transposed triangular factors (built lazily by
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve).  The right-hand side is first
   reordered by the row permutation, U^T then L^T are solved, and the result
   is reordered by the column permutation.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Build the transposed factors on the fly the first time they are needed */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder the right-hand side with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU,   cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Apply the column permutation to the solution; it cannot be done in place,
     so gather into the work vector first */
  thrust::copy(thrust::make_permutation_iterator(xGPU,   cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary back into the full solution */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*
   Solves A^T x = b for a factorization done in the natural ordering:
   no permutations are required, so b feeds the U^T solve directly and
   the L^T solve writes the final answer into x.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Build the transposed factors on the fly the first time they are needed */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*
   Solves A x = b with the triangular factors and the row/column permutations
   from the (re)ordered factorization: reorder b by the row permutation,
   solve L then U, then apply the column permutation to the solution.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder the right-hand side with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* Apply the column permutation to the solution; it cannot be done in place,
     so gather into the work vector first */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary back into the full solution */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*
   Solves A x = b for a factorization done in the natural ordering:
   no permutations are required, so b feeds the L solve directly and
   the U solve writes the final answer into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*
   Mirrors the host AIJ matrix onto the GPU in the selected storage format
   (CSR, ELL, or HYB).  If only values changed (same nonzero state, CSR
   format) just the values array is refreshed; otherwise the whole GPU
   structure is rebuilt.  Rows are always stored compressed on the GPU:
   when the host matrix is not compressed, a compressed row index is built
   here on the fly.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->pinnedtocpu) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* same sparsity pattern: copy values only */
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      matrix->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* fix: the destroy routine returns a PetscErrorCode that was silently dropped */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);CHKERRQ(ierr);
      /* NOTE(review): the previously allocated cusparsestruct->workVector is not freed
         before being replaced below -- confirm it is released elsewhere on rebuild */
      try {
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the matrix information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-resident alpha/beta scalars (pointer mode DEVICE below) */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows    = m;
          matrix->num_cols    = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
          /* stage a temporary CSR copy, convert it to HYB, then discard it */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows    = m;
          matrix->num_cols    = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (matrix) {
            if (matrix->values)         delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets)    delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
#endif
          /* NOTE(review): with CUDA_VERSION<4020 this branch silently leaves matstruct->mat
             unset; the transpose path raises SETERRQ in the analogous case -- confirm intent */
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* ii/ridx were allocated here only in the non-compressed case */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
/* Functor: element-wise b += a over a zipped (a,b) tuple; used to scatter
   compressed-row SpMV results back into the full output vector */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* y = A*x, delegated to MatMultAdd with no vector to add (yy = NULL) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*
   y = A^T x, computed as a NON_TRANSPOSE SpMV on the pre-transposed GPU
   copy of A (built lazily by MatSeqAIJCUSPARSEGenerateTransposeForMult).
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* Refresh the GPU copy first: CPU-side operations (axpy, scale, ...) may have modified A */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when running on the default stream */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*
   z = y + A*x (or z = A*x when yy is NULL).  Because the GPU matrix is
   stored with compressed rows, the SpMV may write into a work vector of
   length nonzerorow which is then scattered/added into z via cprowIndices;
   when the matrix has no empty rows the SpMV writes straight into z.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* Refresh the GPU copy first: CPU-side operations (axpy, scale, ...) may have modified A */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* if every row is nonzero the work vector has full length and the SpMV can target z directly */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n) ? zarray : cusparsestruct->workVector->data().get();
    /* accumulate in place (beta=1) only when z aliases y and the SpMV writes to z itself */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply-add; the row count passed is the number of
       compressed rows, i.e. the size of the work vector */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
#endif
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* SpMV went to the work vector: start z from y, then scatter-add below */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        /* SpMV wrote A*x into z with beta=0: add y */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* plain MatMult through the work vector: z starts at zero */
      ierr = VecSet(zz,0);CHKERRQ(ierr);
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult: only synchronize when running on the default stream */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 1480 } 1481 if (!yy) { /* MatMult */ 1482 if (!cusparsestruct->stream) { 1483 ierr = WaitForGPU();CHKERRCUDA(ierr); 1484 } 1485 } 1486 ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 1487 PetscFunctionReturn(0); 1488 } 1489 1490 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1491 { 1492 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1493 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1494 Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1495 thrust::device_ptr<PetscScalar> zptr; 1496 const PetscScalar *xarray; 1497 PetscScalar *zarray; 1498 PetscErrorCode ierr; 1499 cusparseStatus_t stat; 1500 1501 PetscFunctionBegin; 1502 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1503 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1504 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1505 if (!matstructT) { 1506 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1507 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1508 } 1509 1510 try { 1511 ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); 1512 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1513 ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr); 1514 zptr = thrust::device_pointer_cast(zarray); 1515 1516 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1517 /* multiply add with matrix transpose */ 1518 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1519 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1520 /* here we need to be careful to set the number of rows in the multiply to the 1521 number of compressed rows in the matrix ... 
         which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, matstructT->beta_zero,
                                 cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
                     VecCUDAPlusEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* After the CPU-side assembly finishes, push the matrix data to the GPU (non-factored
   matrices only) and, on final assembly, install the CUSPARSE multiply kernels */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  /* a flush assembly is not final; do not switch the function pointers yet */
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.
   See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Release the GPU-side data (multiply structures or triangular factors, depending on
   whether this is a factored matrix) before destroying the host-side AIJ data */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
      ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Duplicate via MatDuplicate_SeqAIJ, then attach a fresh (empty) CUSPARSE data
   structure and the CUSPARSE function table; GPU data is rebuilt lazily on first use */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    C->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Convert an in-place SeqAIJ matrix to SeqAIJCUSPARSE: attach the CUSPARSE data
   structure (or triangular-factor structure for factored matrices) and function table */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
1721 now build a GPU matrix data structure */ 1722 B->spptr = new Mat_SeqAIJCUSPARSE; 1723 ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat = 0; 1724 ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0; 1725 ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector = 0; 1726 ((Mat_SeqAIJCUSPARSE*)B->spptr)->format = MAT_CUSPARSE_CSR; 1727 ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream = 0; 1728 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1729 ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle = handle; 1730 ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0; 1731 } else { 1732 /* NEXT, set the pointers to the triangular factors */ 1733 B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 1734 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr = 0; 1735 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr = 0; 1736 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0; 1737 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0; 1738 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices = 0; 1739 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices = 0; 1740 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector = 0; 1741 stat = cusparseCreate(&handle);CHKERRCUDA(stat); 1742 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle = handle; 1743 ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz = 0; 1744 } 1745 1746 B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 1747 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1748 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1749 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1750 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1751 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1752 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1753 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1754 1755 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1756 1757 B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1758 1759 ierr = PetscObjectComposeFunction((PetscObject)B, 
"MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1760 PetscFunctionReturn(0); 1761 } 1762 1763 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1764 { 1765 PetscErrorCode ierr; 1766 1767 PetscFunctionBegin; 1768 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1769 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B); 1770 PetscFunctionReturn(0); 1771 } 1772 1773 /*MC 1774 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1775 1776 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1777 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1778 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1779 1780 Options Database Keys: 1781 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1782 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1783 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1784 1785 Level: beginner 1786 1787 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1788 M*/ 1789 1790 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1791 1792 1793 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 1794 { 1795 PetscErrorCode ierr; 1796 1797 PetscFunctionBegin; 1798 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1799 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1800 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1801 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1802 PetscFunctionReturn(0); 1803 } 1804 1805 1806 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1807 { 1808 cusparseStatus_t stat; 1809 cusparseHandle_t handle; 1810 1811 PetscFunctionBegin; 1812 if (*cusparsestruct) { 1813 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1814 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1815 delete (*cusparsestruct)->workVector; 1816 if (handle = (*cusparsestruct)->handle) { 1817 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1818 } 1819 delete *cusparsestruct; 1820 *cusparsestruct = 0; 1821 } 1822 PetscFunctionReturn(0); 1823 } 1824 1825 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1826 { 1827 PetscFunctionBegin; 1828 if (*mat) { 1829 delete (*mat)->values; 1830 delete (*mat)->column_indices; 1831 delete (*mat)->row_offsets; 1832 delete *mat; 1833 *mat = 0; 1834 } 1835 PetscFunctionReturn(0); 1836 } 1837 1838 
/* Free a triangular-factor structure: the matrix descriptor, the solve analysis
   info, and the factor's CSR data */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/* Free a multiply structure in the given storage format (HYB/ELL matrices via
   cusparseDestroyHybMat, CSR via CsrMatrix_Destroy), plus the descriptor, the
   compressed-row index vector, and the device-resident alpha/beta scalars */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* bug fix: this destroy call's return code was previously not checked */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta were allocated on the device with cudaMalloc (pointer-mode DEVICE) */
    if ((*matstruct)->alpha)     { err = cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err = cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err = cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Free all four triangular factors (plain and transpose), the row/column permutation
   index vectors, the work vector, and the cusparse handle */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    /* bug fix: the PetscErrorCode returned by these destroy routines was previously discarded */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((handle = (*trifactors)->handle)) { /* intentional assignment */
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}