/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library,
*/
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

/* Human-readable names for MatCUSPARSEStorageFormat, consumed by PetscOptionsEnum() */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* forward declarations of the factorization/solve/mult implementations defined below */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode
MatSeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);  /* was mis-declared with the MultStruct name, conflicting with the next prototype */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Attach a CUDA stream to the cuSPARSE handle held in A->spptr so subsequent
   cuSPARSE work on A runs in that stream */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Replace the cuSPARSE handle stored in A->spptr (destroying any different
   existing handle) and force device pointer mode on it */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* a different handle was installed earlier; release it before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}

/* Drop the cuSPARSE handle reference held in A->spptr without destroying it
   (ownership is assumed to lie with the caller that installed it) */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}

/* Query callback composed on factor matrices: reports the solver package name */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k).
Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/* Factory routine registered for MATSOLVERCUSPARSE: creates the empty factor
   matrix B and installs the symbolic-factorization entry points matching the
   requested factor type; numeric routines are installed by the symbolic stage. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* the factor matrix is filled by the factorization itself, never preallocated */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation
op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    /* a SEQAIJCUSPARSE matrix has a single format slot, so both operations set it */
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB.
The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; a no-op for other matrix types */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_* options that select the GPU storage format */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* SpMV-only format choice applies to non-factor matrices */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Symbolic ILU runs on the CPU; the numeric stage is redirected to the CUSPARSE version */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic LU runs on the CPU; the numeric stage is redirected to the CUSPARSE version */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC runs on the CPU; the numeric stage is redirected to the CUSPARSE version */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic Cholesky runs on the CPU; the numeric stage is redirected to the CUSPARSE version */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Assemble the unit lower-triangular ILU factor L on the GPU from the host
   factor stored (combined with U) in A's Mat_SeqAIJ data */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix ==
PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate pinned host staging space for the lower triangular matrix */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix: row 0 holds only the unit diagonal */
      AiLo[0]   = (PetscInt) 0;
      AiLo[n]   = nzLower;
      AjLo[0]   = (PetscInt) 0;
      AALo[0]   = (MatScalar) 1.0;
      v         = aa;
      vi        = aj;
      offset    = 1;
      rowOffset = 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* strictly-lower entries of row i, then the explicit unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp =
CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* upload the assembled CSR triangle to the device via thrust containers */
      loTriFactor->csrMat              = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = n;
      loTriFactor->csrMat->num_cols    = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the cuSPARSE triangular-solve analysis phase */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary?
*/
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned host staging buffers */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Assemble the upper-triangular ILU factor U (non-unit diagonal, stored inverted)
   on the GPU from the host factor stored in A's Mat_SeqAIJ data */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
*/
      nzUpper = adiag[0]-adiag[n];

      /* Allocate pinned host staging space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first
         (adiag[] indexes run backwards for the U part of the SeqAIJ factor) */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements: stored as 1/d so the solve multiplies */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* upload the assembled CSR triangle to the device via thrust containers */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the cuSPARSE triangular-solve analysis phase */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned host staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build both ILU factors on the GPU, allocate the solve work vector, and cache
   the row/column permutations (if non-identity) needed by MatSolve */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector =
new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log only the permutation data actually shipped to the GPU */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if (!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if (!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Build both ICC (Cholesky) triangular factors on the GPU; the lower solve is
   later performed as a transposed solve of the stored upper factor */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): same A->data is also read through an SBAIJ view here — confirm the
     ICC factor is stored with SBAIJ-compatible layout */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if
(A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate pinned host staging space for the upper triangle (values for both
         the upper and the implied lower factor are staged) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* off-diagonal values are negated for U; L additionally scaled by 1/d */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat =
cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* upload the assembled upper CSR factor to the device */
      upTriFactor->csrMat              = new CsrMatrix;
      upTriFactor->csrMat->num_rows    = A->rmap->n;
      upTriFactor->csrMat->num_cols    = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the cuSPARSE triangular-solve analysis phase */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary?
*/
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description. The "lower" factor reuses the upper-triangular
         storage and is solved as a transposed operation, hence FILL_MODE_UPPER here */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation: transposed solve of the upper factor */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* upload the lower-factor values (same sparsity pattern as the upper factor) */
      loTriFactor->csrMat              = new CsrMatrix;
      loTriFactor->csrMat->num_rows    = A->rmap->n;
      loTriFactor->csrMat->num_cols    = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the cuSPARSE triangular-solve analysis phase */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      /* release the pinned host staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/* Build the ICC factors on the GPU and cache the (symmetric) permutation used by MatSolve */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* both triangles share the off-diagonal entries; the diagonal is counted once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* the same permutation applies to rows and columns (symmetric factorization) */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric LU on the CPU, then select the natural-ordering or permuted GPU solve
   and stage the factors on the device */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Numeric Cholesky on the CPU, then select the natural-ordering or permuted GPU
   solve and stage the factors on the device */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used.
*/
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    /* natural ordering: use the permutation-free solve kernels */
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Builds, on the GPU, explicit transposes (CSC stored as CSR) of the lower and upper
   triangular factors held in A->spptr, and runs the cusparse solve analysis on each,
   so that the MatSolveTranspose_SeqAIJCUSPARSE* kernels can perform transpose
   triangular solves.  Results are stored in loTriFactorPtrTranspose and
   upTriFactorPtrTranspose of the Mat_SeqAIJCUSPARSETriFactors structure.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t stat;
  cusparseIndexBase_t indexBase;
  cusparseMatrixType_t matrixType;
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  /* NOTE(review): this overwrites the value just read from loTriFactorPtrTranspose;
     callers only invoke this routine when the transpose has not been built yet, so
     that initializer is a dead store — confirm before relying on it */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  /* the fill mode flips under transposition: lower becomes upper and vice versa */
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: the factor is stored explicitly transposed, so the solve itself is non-transpose */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  /* again, fill mode flips under transposition */
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}

/*
   Builds the GPU data structure (matTranspose in A->spptr) used by
   MatMultTranspose_SeqAIJCUSPARSE and MatMultTransposeAdd_SeqAIJCUSPARSE: an explicit
   transpose of the matrix in the chosen storage format (CSR, or HYB/ELL via
   CSR->CSC->HYB conversion), plus the scalar constants needed by the SpMV calls.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t stat;
  cusparseIndexBase_t indexBase;
  cudaError_t err;
  PetscErrorCode ierr;

  PetscFunctionBegin;

  /* allocate space for the triangular factor information */
  /* NOTE(review): overwrites the matTranspose pointer read above (dead store);
     callers invoke this only when matTranspose is NULL */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalars (pointer mode DEVICE is set below) */
  err = cudaMalloc((void **)&(matstructT->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    thrust::device_vector<int> *rowoffsets_gpu;
    const int ncomprow = matstruct->cprowIndices->size();
    thrust::host_vector<int> cprow(ncomprow);
    cprow = *matstruct->cprowIndices; // GPU --> CPU
    /* the transpose swaps the row/column dimensions */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);
    PetscInt k,i;
    thrust::host_vector<int> rowst_host(A->rmap->n+1);

    /* expand compress rows, which is forced in constructor (MatSeqAIJCUSPARSECopyToGPU):
       rebuild full row offsets on the host, giving empty rows a zero-length span */
    rowst_host[0] = 0;
    for (k = 0, i = 0; i < A->rmap->n ; i++) {
      if (k < ncomprow && i==cprow[k]) {
        rowst_host[i+1] = a->i[i+1];
        k++;
      } else rowst_host[i+1] = rowst_host[i];
    }
    if (k!=ncomprow) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"%D != %D",k,ncomprow);
    rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    *rowoffsets_gpu = rowst_host; // CPU --> GPU
    /* NOTE(review): rowoffsets_gpu does not appear to be freed in this scope — possible
       device-memory leak; confirm against the destroy path */

    /* compute the transpose of the matrix, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices: the transpose has no empty rows compressed,
     so the index list is simply 0..A->cmap->n-1 */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}

/*
   Solves A^T x = b on the GPU for a factored matrix with non-trivial ordering:
   applies the row permutation, then the transposed U and L solves (using the explicit
   transposes built by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve), then the column
   permutation.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt n = xx->map->n;
  const PetscScalar *barray;
  PetscScalar *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar> xGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    /* lazily build the explicit factor transposes on first transpose solve */
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (for the transpose system, the U^T solve comes first) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Solves A^T x = b on the GPU for a factorization done in natural (identity)
   ordering: same as MatSolveTranspose_SeqAIJCUSPARSE but without the row/column
   permutation steps.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    /* lazily build the explicit factor transposes on first transpose solve */
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Solves A x = b on the GPU for a factored matrix with non-trivial ordering:
   row permutation, L solve, U solve, column permutation.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar> xGPU;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  /* NOTE(review): the end iterator reuses bGPU as base (range length comes from the
     rpermIndices map iterators), whereas MatSolveTranspose_SeqAIJCUSPARSE uses bGPU+n;
     both forms traverse the same index range — confirm intentional inconsistency */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Solves A x = b on the GPU for a factorization done in natural (identity) ordering:
   L solve then U solve, no permutations.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar *barray;
  PetscScalar *xarray;
  cusparseStatus_t stat;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Mirrors the host AIJ matrix onto the GPU in the selected storage format.
   Fast path: if only the numerical values changed (same nonzero pattern, CSR format),
   just re-uploads the values.  Otherwise rebuilds the whole GPU structure, always in
   compressed-row form (empty rows removed), recording the map from compressed rows to
   full rows in cprowIndices.  No-op when the matrix is pinned to the CPU or the GPU
   copy is already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
  PetscInt m = A->rmap->n,*ii,*ridx;
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cudaError_t err;

  PetscFunctionBegin;
  if (A->pinnedtocpu) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* copy values only */
      matrix->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* nonzero pattern changed (or non-CSR format): rebuild from scratch */
      MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-resident alpha/beta scalars for the SpMV calls (pointer mode DEVICE) */
        err = cudaMalloc((void **)&(matstruct->alpha), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* stage the data in a temporary CSR matrix, then convert it to HYB/ELL */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                  matstruct->descr, matrix->values->data().get(),
                                  matrix->row_offsets->data().get(),
                                  matrix->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the temporary CSR staging matrix is no longer needed */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* ii/ridx were allocated here only when the host matrix was not compressed */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: element-wise += used to scatter compressed-row SpMV results
   back into the full-length result vector */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* y = A x, implemented by calling MatMultAdd with yy = NULL */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr); 1365 PetscFunctionReturn(0); 1366 } 1367 1368 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 1369 { 1370 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1371 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1372 Mat_SeqAIJCUSPARSEMultStruct *matstructT; 1373 const PetscScalar *xarray; 1374 PetscScalar *yarray; 1375 PetscErrorCode ierr; 1376 cusparseStatus_t stat; 1377 1378 PetscFunctionBegin; 1379 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1380 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1381 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1382 if (!matstructT) { 1383 ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1384 matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1385 } 1386 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1387 ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 1388 1389 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1390 if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1391 CsrMatrix *mat = (CsrMatrix*)matstructT->mat; 1392 stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1393 mat->num_rows, mat->num_cols, 1394 mat->num_entries, matstructT->alpha, matstructT->descr, 1395 mat->values->data().get(), mat->row_offsets->data().get(), 1396 mat->column_indices->data().get(), xarray, matstructT->beta_zero, 1397 yarray);CHKERRCUDA(stat); 1398 } else { 1399 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat; 1400 stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 1401 matstructT->alpha, matstructT->descr, hybMat, 1402 xarray, matstructT->beta_zero, 1403 yarray);CHKERRCUDA(stat); 1404 } 1405 ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 1406 ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 1407 ierr = 
WaitForGPU();CHKERRCUDA(ierr); 1408 ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1409 ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr); 1410 PetscFunctionReturn(0); 1411 } 1412 1413 1414 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 1415 { 1416 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1417 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1418 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 1419 const PetscScalar *xarray; 1420 PetscScalar *zarray,*dptr,*beta; 1421 PetscErrorCode ierr; 1422 cusparseStatus_t stat; 1423 PetscBool cmpr; /* if the matrix has been compressed (zero rows) */ 1424 1425 PetscFunctionBegin; 1426 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 1427 ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1428 matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1429 try { 1430 cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n)); 1431 ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr); 1432 if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */ 1433 ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr); 1434 } else { 1435 ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr); 1436 } 1437 dptr = cmpr ? zarray : cusparsestruct->workVector->data().get(); 1438 beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero; 1439 1440 ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1441 /* csr_spmv is multiply add */ 1442 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1443 /* here we need to be careful to set the number of rows in the multiply to the 1444 number of compressed rows in the matrix ... 
which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
        /* ELL/HYB path; skip entirely when there are no (compressed) rows to compute */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* combine with yy (or zero zz) when the SpMV went into the work vector */
    if (yy) {
      if (dptr != zarray) {
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      /* cprowIndices maps compressed row k to its true row index in zz */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) {
      ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = yy + A^T*xx on the GPU, via the cached explicit transpose.
   Mirrors MatMultAdd_SeqAIJCUSPARSE but on matTranspose. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build the explicit transpose lazily on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* SpMV target: zz directly when the work vector spans all columns of A, else the work vector */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get();
    beta = (yy == zz && dptr == zarray) ?
matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* fold in yy when the SpMV did not already accumulate into zz */
    if (dptr != zarray) {
      ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    } else if (zz != yy) {
      ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);

      /* scatter the data from the temporary into the full vector with a += operation */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr); /* is this needed? just for yy==0 in Mult */
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Finish assembly on the CPU, mirror the matrix to the GPU (unless it is a factor),
   and (re)install the GPU mat-vec operations. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.
nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Release the GPU-side structures (plain matrix or triangular factors) and then
   fall through to the CPU SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
      ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Duplicate the CPU AIJ matrix, then attach fresh (empty) CUSPARSE structures to
   the copy; the GPU data is rebuilt on the next MatSeqAIJCUSPARSECopyToGPU(). */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    C->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* In-place conversion of a SeqAIJ matrix to SeqAIJCUSPARSE: swaps in the CUDA
   default vector type, allocates empty GPU structures, and installs the GPU ops. */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1770 B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 1771 B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 1772 B->ops->mult = MatMult_SeqAIJCUSPARSE; 1773 B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 1774 B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 1775 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 1776 B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 1777 1778 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1779 1780 B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED; 1781 1782 ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 1783 PetscFunctionReturn(0); 1784 } 1785 1786 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 1787 { 1788 PetscErrorCode ierr; 1789 1790 PetscFunctionBegin; 1791 ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 1792 ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr); 1793 PetscFunctionReturn(0); 1794 } 1795 1796 /*MC 1797 MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 1798 1799 A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 1800 CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 1801 All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 1802 1803 Options Database Keys: 1804 + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 1805 . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 1806 - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
1807 1808 Level: beginner 1809 1810 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 1811 M*/ 1812 1813 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 1814 1815 1816 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 1817 { 1818 PetscErrorCode ierr; 1819 1820 PetscFunctionBegin; 1821 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1822 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1823 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1824 ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 1825 PetscFunctionReturn(0); 1826 } 1827 1828 1829 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 1830 { 1831 cusparseStatus_t stat; 1832 cusparseHandle_t handle; 1833 1834 PetscFunctionBegin; 1835 if (*cusparsestruct) { 1836 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format); 1837 MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format); 1838 delete (*cusparsestruct)->workVector; 1839 if (handle = (*cusparsestruct)->handle) { 1840 stat = cusparseDestroy(handle);CHKERRCUDA(stat); 1841 } 1842 delete *cusparsestruct; 1843 *cusparsestruct = 0; 1844 } 1845 PetscFunctionReturn(0); 1846 } 1847 1848 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 1849 { 1850 PetscFunctionBegin; 1851 if (*mat) { 1852 delete (*mat)->values; 1853 delete (*mat)->column_indices; 1854 delete (*mat)->row_offsets; 1855 delete *mat; 1856 *mat = 0; 1857 } 1858 PetscFunctionReturn(0); 1859 } 1860 1861 
/* Destroy one triangular-factor structure: its cusparse descriptor, solve-analysis
   info, and CSR storage. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}

/* Destroy a mult structure in whichever storage format it holds (HYB/ELL or CSR),
   its descriptor, compressed-row index vector, and the device-side scalar constants.
   Fix: the error code returned by CsrMatrix_Destroy() was previously ignored; it is
   now checked with CHKERRQ. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta_zero/beta_one are device allocations (see the cudaFree), so they
       must be released with the CUDA runtime, not delete */
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}

/* Destroy the complete triangular-factor bundle: all four factor structures, the
   permutation index vectors, the work vector, and the cusparse handle.
   Fix: the error codes returned by the four MatSeqAIJCUSPARSEMultStruct_Destroy()
   calls were previously ignored; they are now checked with CHKERRQ. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((handle = (*trifactors)->handle)) { /* assignment intended */
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}