19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 39ae82921SPaul Mullowney matrix storage format. 49ae82921SPaul Mullowney */ 59ae82921SPaul Mullowney 69ae82921SPaul Mullowney #include "petscconf.h" 79ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_BEGIN 89ae82921SPaul Mullowney #include "../src/mat/impls/aij/seq/aij.h" /*I "petscmat.h" I*/ 99ae82921SPaul Mullowney //#include "petscbt.h" 109ae82921SPaul Mullowney #include "../src/vec/vec/impls/dvecimpl.h" 119ae82921SPaul Mullowney #include "petsc-private/vecimpl.h" 129ae82921SPaul Mullowney PETSC_CUDA_EXTERN_C_END 139ae82921SPaul Mullowney #undef VecType 149ae82921SPaul Mullowney #include "cusparsematimpl.h" 159ae82921SPaul Mullowney 16ca45077fSPaul Mullowney // this is such a hack ... but I don't know of another way to pass this variable 17ca45077fSPaul Mullowney // from one GPU_Matrix_Ifc class to another. This is necessary for the parallel 18ca45077fSPaul Mullowney // SpMV. Essentially, I need to use the same stream variable in two different 19ca45077fSPaul Mullowney // data structures. I do this by creating a single instance of that stream 20ca45077fSPaul Mullowney // and reuse it. 21ca45077fSPaul Mullowney cudaStream_t theBodyStream=0; 229ae82921SPaul Mullowney 239ae82921SPaul Mullowney EXTERN_C_BEGIN 249ae82921SPaul Mullowney PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 259ae82921SPaul Mullowney PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 269ae82921SPaul Mullowney PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo *); 279ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 289ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 299ae82921SPaul Mullowney PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat); 309ae82921SPaul Mullowney PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat); 318dc1d2a3SPaul Mullowney PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 328dc1d2a3SPaul Mullowney PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 338dc1d2a3SPaul Mullowney PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 348dc1d2a3SPaul Mullowney PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 359ae82921SPaul Mullowney EXTERN_C_END 369ae82921SPaul Mullowney 379ae82921SPaul Mullowney EXTERN_C_BEGIN 389ae82921SPaul Mullowney #undef __FUNCT__ 399ae82921SPaul Mullowney #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse" 409ae82921SPaul Mullowney PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type) 419ae82921SPaul Mullowney { 429ae82921SPaul Mullowney PetscFunctionBegin; 439ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 449ae82921SPaul Mullowney PetscFunctionReturn(0); 459ae82921SPaul Mullowney } 469ae82921SPaul Mullowney EXTERN_C_END 479ae82921SPaul Mullowney 489ae82921SPaul Mullowney EXTERN_C_BEGIN 499ae82921SPaul Mullowney extern PetscErrorCode MatGetFactor_seqaij_petsc(Mat,MatFactorType,Mat*); 509ae82921SPaul Mullowney EXTERN_C_END 519ae82921SPaul Mullowney 529ae82921SPaul Mullowney 539ae82921SPaul Mullowney 549ae82921SPaul Mullowney /*MC 559ae82921SPaul Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers (ILU) for seq matrices 569ae82921SPaul Mullowney on the GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp 578dc1d2a3SPaul Mullowney 5~ 589ae82921SPaul Mullowney ./configure 
--download-txpetscgpu to install PETSc to use CUSPARSE solves 599ae82921SPaul Mullowney 609ae82921SPaul Mullowney Options Database Keys: 61ca45077fSPaul Mullowney + -mat_mult_cusparse_storage_format <CSR> - (choose one of) CSR, ELL, HYB 62ca45077fSPaul Mullowney + -mat_solve_cusparse_storage_format <CSR> - (choose one of) CSR, ELL, HYB 639ae82921SPaul Mullowney 649ae82921SPaul Mullowney Level: beginner 659ae82921SPaul Mullowney M*/ 669ae82921SPaul Mullowney 679ae82921SPaul Mullowney EXTERN_C_BEGIN 689ae82921SPaul Mullowney #undef __FUNCT__ 699ae82921SPaul Mullowney #define __FUNCT__ "MatGetFactor_seqaij_cusparse" 709ae82921SPaul Mullowney PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B) 719ae82921SPaul Mullowney { 729ae82921SPaul Mullowney PetscErrorCode ierr; 739ae82921SPaul Mullowney 749ae82921SPaul Mullowney PetscFunctionBegin; 759ae82921SPaul Mullowney ierr = MatGetFactor_seqaij_petsc(A,ftype,B);CHKERRQ(ierr); 769ae82921SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT){ 779ae82921SPaul Mullowney ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 789ae82921SPaul Mullowney ierr = MatSetFromOptions_SeqAIJCUSPARSE(*B);CHKERRQ(ierr); 799ae82921SPaul Mullowney ierr = PetscObjectComposeFunctionDynamic((PetscObject)(*B),"MatFactorGetSolverPackage_C","MatFactorGetSolverPackage_seqaij_cusparse",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr); 809ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 819ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 829ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 839ae82921SPaul Mullowney (*B)->factortype = ftype; 849ae82921SPaul Mullowney PetscFunctionReturn(0); 859ae82921SPaul Mullowney } 869ae82921SPaul Mullowney EXTERN_C_END 879ae82921SPaul Mullowney 889ae82921SPaul Mullowney #undef __FUNCT__ 899ae82921SPaul Mullowney #define __FUNCT__ "MatAIJCUSPARSESetGPUStorageFormatForMatSolve" 90ca45077fSPaul Mullowney PetscErrorCode MatAIJCUSPARSESetGPUStorageFormatForMatSolve(Mat A,GPUStorageFormat format) 919ae82921SPaul Mullowney { 929ae82921SPaul Mullowney PetscErrorCode ierr; 939ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 949ae82921SPaul Mullowney GPU_Matrix_Ifc* cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 959ae82921SPaul Mullowney GPU_Matrix_Ifc* cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 969ae82921SPaul Mullowney PetscFunctionBegin; 979ae82921SPaul Mullowney cusparseTriFactors->format = format; 989ae82921SPaul Mullowney if (cusparseMatLo && cusparseMatUp) { 999ae82921SPaul Mullowney // If data exists already delete 1009ae82921SPaul Mullowney if (cusparseMatLo) 1019ae82921SPaul Mullowney delete (GPU_Matrix_Ifc *)cusparseMatLo; 1029ae82921SPaul Mullowney if (cusparseMatUp) 1039ae82921SPaul Mullowney delete (GPU_Matrix_Ifc *)cusparseMatUp; 1049ae82921SPaul Mullowney 1059ae82921SPaul Mullowney // key data was deleted so reset the flag. 
      A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

      // rebuild matrix
      ierr=MatCUSPARSEAnalysisAndCopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

//EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetOption_SeqAIJCUSPARSE"
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  PetscFunctionBegin;
  ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  switch (op) {
  case MAT_DIAGBLOCK_CSR:
  case MAT_OFFDIAGBLOCK_CSR:
  case MAT_CSR:
    cusparseMat->format = CSR;
    break;
  case MAT_DIAGBLOCK_DIA:
  case MAT_OFFDIAGBLOCK_DIA:
  case MAT_DIA:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported GPU matrix storage format DIA for (MPI,SEQ)AIJCUSPARSE matrix type.");
  case MAT_DIAGBLOCK_ELL:
  case MAT_OFFDIAGBLOCK_ELL:
  case MAT_ELL:
    cusparseMat->format = ELL;
    break;
  case MAT_DIAGBLOCK_HYB:
  case MAT_OFFDIAGBLOCK_HYB:
  case MAT_HYB:
    cusparseMat->format = HYB;
    break;
  default:
    break;
  }
  PetscFunctionReturn(0);
}
//EXTERN_C_END

//EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscInt       idx=0;
  char           *formats[] = {CSR,ELL,HYB};
  MatOption      format;
  PetscBool      flg;
  PetscFunctionBegin;
  ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"When using TxPETSCGPU, AIJCUSPARSE Options","Mat");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEList("-mat_mult_cusparse_storage_format",
                             "Set the storage format of (seq)aijcusparse gpu matrices for SpMV",
                             "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
    // the list {CSR,ELL,HYB} has three entries, so PetscOptionsEList returns idx in 0..2
    switch (idx)
    {
    case 0:
      format=MAT_CSR;
      break;
    case 1:
      format=MAT_ELL;
      break;
    case 2:
      format=MAT_HYB;
      break;
    }
    ierr=MatSetOption_SeqAIJCUSPARSE(A,format,PETSC_TRUE);CHKERRQ(ierr);
  }
  else {
    ierr = PetscOptionsEList("-mat_solve_cusparse_storage_format",
                             "Set the storage format of (seq)aijcusparse gpu matrices for TriSolve",
                             "None",formats,3,formats[0],&idx,&flg);CHKERRQ(ierr);
    ierr=MatAIJCUSPARSESetGPUStorageFormatForMatSolve(A,formats[idx]);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
//EXTERN_C_END

#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEBuildLowerTriMatrix"
PetscErrorCode MatCUSPARSEBuildLowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  GPU_Matrix_Ifc               *cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t             stat;
  const PetscInt               *ai = a->i,*aj = a->j,*vi;
  const MatScalar              *aa = a->a,*v;
  PetscErrorCode               ierr;
  PetscInt                     *AiLo, *AjLo;
  PetscScalar                  *AALo;
  PetscInt                     i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
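    /*
       Note on the layout assumed below: in PETSc's factored SeqAIJ storage,
       a->i/a->j/a->a hold the strictly lower triangular part of L row by row
       (the unit diagonal of L is implicit), while the rows of U are addressed
       through a->diag and handled in MatCUSPARSEBuildUpperTriMatrix.  The loop
       below therefore expands L into an explicit unit-lower-triangular CSR
       matrix for CUSPARSE, which is why
         nzLower = n + ai[n] - ai[1]
       i.e. the strictly lower nonzeros of rows 1..n-1 plus one explicit 1.0
       diagonal entry per row.
    */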
2349ae82921SPaul Mullowney try { 2359ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 2369ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 2379ae82921SPaul Mullowney 2389ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 2399ae82921SPaul Mullowney ierr = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 2409ae82921SPaul Mullowney ierr = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr); 2419ae82921SPaul Mullowney ierr = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr); 2429ae82921SPaul Mullowney 2439ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 2449ae82921SPaul Mullowney AiLo[0]=(PetscInt) 0; 2459ae82921SPaul Mullowney AiLo[n]=nzLower; 2469ae82921SPaul Mullowney AjLo[0]=(PetscInt) 0; 2479ae82921SPaul Mullowney AALo[0]=(MatScalar) 1.0; 2489ae82921SPaul Mullowney v = aa; 2499ae82921SPaul Mullowney vi = aj; 2509ae82921SPaul Mullowney offset=1; 2519ae82921SPaul Mullowney rowOffset=1; 2529ae82921SPaul Mullowney for (i=1; i<n; i++) { 2539ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 2549ae82921SPaul Mullowney // additional 1 for the term on the diagonal 2559ae82921SPaul Mullowney AiLo[i]=rowOffset; 2569ae82921SPaul Mullowney rowOffset+=nz+1; 2579ae82921SPaul Mullowney 2589ae82921SPaul Mullowney ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 2599ae82921SPaul Mullowney ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 2609ae82921SPaul Mullowney 2619ae82921SPaul Mullowney offset+=nz; 2629ae82921SPaul Mullowney AjLo[offset]=(PetscInt) i; 2639ae82921SPaul Mullowney AALo[offset]=(MatScalar) 1.0; 2649ae82921SPaul Mullowney offset+=1; 2659ae82921SPaul Mullowney 2669ae82921SPaul Mullowney v += nz; 2679ae82921SPaul Mullowney vi += nz; 2689ae82921SPaul Mullowney } 2699ae82921SPaul Mullowney cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format); 2709ae82921SPaul Mullowney stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat); 2719ae82921SPaul Mullowney ierr = cusparseMat->setMatrix(n, n, nzLower, AiLo, AjLo, AALo);CHKERRCUSP(ierr); 2729ae82921SPaul Mullowney stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); 2739ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = cusparseMat; 2749ae82921SPaul Mullowney // free temporary data 2759ae82921SPaul Mullowney ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr); 2769ae82921SPaul Mullowney ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr); 2779ae82921SPaul Mullowney ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr); 2789ae82921SPaul Mullowney } catch(char* ex) { 2799ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 2809ae82921SPaul Mullowney } 2819ae82921SPaul Mullowney } 2829ae82921SPaul Mullowney PetscFunctionReturn(0); 2839ae82921SPaul Mullowney } 2849ae82921SPaul Mullowney 2859ae82921SPaul Mullowney #undef __FUNCT__ 2869ae82921SPaul Mullowney #define __FUNCT__ "MatCUSPARSEBuildUpperTriMatrix" 2879ae82921SPaul Mullowney PetscErrorCode MatCUSPARSEBuildUpperTriMatrix(Mat A) 2889ae82921SPaul Mullowney { 2899ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2909ae82921SPaul Mullowney PetscInt n = A->rmap->n; 2919ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 2929ae82921SPaul Mullowney 
GPU_Matrix_Ifc* cusparseMat = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 2939ae82921SPaul Mullowney cusparseStatus_t stat; 2949ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 2959ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 2969ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 2979ae82921SPaul Mullowney PetscScalar *AAUp; 2989ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 2999ae82921SPaul Mullowney PetscErrorCode ierr; 3009ae82921SPaul Mullowney 3019ae82921SPaul Mullowney PetscFunctionBegin; 3029ae82921SPaul Mullowney 3039ae82921SPaul Mullowney if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){ 3049ae82921SPaul Mullowney try { 3059ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. */ 3069ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 3079ae82921SPaul Mullowney 3089ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 3099ae82921SPaul Mullowney ierr = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr); 3109ae82921SPaul Mullowney ierr = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr); 3119ae82921SPaul Mullowney ierr = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr); 3129ae82921SPaul Mullowney 3139ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 3149ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 3159ae82921SPaul Mullowney AiUp[n]=nzUpper; 3169ae82921SPaul Mullowney offset = nzUpper; 3179ae82921SPaul Mullowney for (i=n-1; i>=0; i--){ 3189ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 3199ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 3209ae82921SPaul Mullowney 3219ae82921SPaul Mullowney // number of elements NOT on the diagonal 3229ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 3239ae82921SPaul Mullowney 3249ae82921SPaul Mullowney // decrement the offset 3259ae82921SPaul Mullowney offset -= (nz+1); 3269ae82921SPaul Mullowney 3279ae82921SPaul Mullowney // first, set the diagonal elements 3289ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 3299ae82921SPaul Mullowney AAUp[offset] = 1./v[nz]; 3309ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 3319ae82921SPaul Mullowney 3329ae82921SPaul Mullowney ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr); 3339ae82921SPaul Mullowney ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr); 3349ae82921SPaul Mullowney } 3359ae82921SPaul Mullowney cusparseMat = GPU_Matrix_Factory::getNew(cusparseTriFactors->format); 3369ae82921SPaul Mullowney stat = cusparseMat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_TRIANGULAR, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 3379ae82921SPaul Mullowney ierr = cusparseMat->setMatrix(n, n, nzUpper, AiUp, AjUp, AAUp);CHKERRCUSP(ierr); 3389ae82921SPaul Mullowney stat = cusparseMat->solveAnalysis();CHKERRCUSP(stat); 3399ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = cusparseMat; 3409ae82921SPaul Mullowney // free temporary data 3419ae82921SPaul Mullowney ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr); 3429ae82921SPaul Mullowney ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr); 3439ae82921SPaul Mullowney ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr); 3449ae82921SPaul Mullowney } catch(char* ex) { 3459ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 3469ae82921SPaul Mullowney } 3479ae82921SPaul Mullowney } 3489ae82921SPaul Mullowney 
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEAnalysisAndCopyToGPU"
PetscErrorCode MatCUSPARSEAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCUSPARSEBuildLowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatCUSPARSEBuildUpperTriMatrix(A);CHKERRQ(ierr);
  cusparseTriFactors->tempvec = new CUSPARRAY;
  cusparseTriFactors->tempvec->resize(n);

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  //lower triangular indices
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity)
    ierr = cusparseTriFactors->loTriFactorPtr->setOrdIndices(r, n);CHKERRCUSP(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  //upper triangular indices
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity)
    ierr = cusparseTriFactors->upTriFactorPtr->setOrdIndices(c, n);CHKERRCUSP(ierr);
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *b = (Mat_SeqAIJ *)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  // determine which version of MatSolve needs to be used.
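  // When both the row and column permutations from the symbolic factorization
  // are identities, MatSolve_SeqAIJCUSPARSE_NaturalOrdering is installed: it
  // skips the reorderIn/reorderOut permutation steps and applies the two
  // triangular solves directly to the right-hand side.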
3989ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3999ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 4009ae82921SPaul Mullowney if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 4019ae82921SPaul Mullowney else B->ops->solve = MatSolve_SeqAIJCUSPARSE; 4028dc1d2a3SPaul Mullowney 403ca45077fSPaul Mullowney //if (!row_identity) printf("Row permutations exist!"); 404ca45077fSPaul Mullowney //if (!col_identity) printf("Col permutations exist!"); 4059ae82921SPaul Mullowney 4069ae82921SPaul Mullowney // get the triangular factors 4079ae82921SPaul Mullowney ierr = MatCUSPARSEAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 4089ae82921SPaul Mullowney PetscFunctionReturn(0); 4099ae82921SPaul Mullowney } 4109ae82921SPaul Mullowney 4119ae82921SPaul Mullowney 4129ae82921SPaul Mullowney 4139ae82921SPaul Mullowney #undef __FUNCT__ 4149ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE" 4159ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 4169ae82921SPaul Mullowney { 4179ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4189ae82921SPaul Mullowney PetscErrorCode ierr; 4199ae82921SPaul Mullowney CUSPARRAY *xGPU, *bGPU; 4209ae82921SPaul Mullowney cusparseStatus_t stat; 4219ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 4229ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 4239ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 4249ae82921SPaul Mullowney CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 4259ae82921SPaul Mullowney 4269ae82921SPaul Mullowney PetscFunctionBegin; 4279ae82921SPaul Mullowney // Get the GPU pointers 4289ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 4299ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 4309ae82921SPaul Mullowney 4319ae82921SPaul Mullowney // solve with reordering 4329ae82921SPaul Mullowney ierr = cusparseMatLo->reorderIn(xGPU, bGPU);CHKERRCUSP(ierr); 4339ae82921SPaul Mullowney stat = cusparseMatLo->solve(xGPU, tempGPU);CHKERRCUSP(stat); 4349ae82921SPaul Mullowney stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat); 4359ae82921SPaul Mullowney ierr = cusparseMatUp->reorderOut(xGPU);CHKERRCUSP(ierr); 4369ae82921SPaul Mullowney 4379ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 4389ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 4399ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 4409ae82921SPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 4419ae82921SPaul Mullowney PetscFunctionReturn(0); 4429ae82921SPaul Mullowney } 4439ae82921SPaul Mullowney 4449ae82921SPaul Mullowney 4459ae82921SPaul Mullowney 4469ae82921SPaul Mullowney 4479ae82921SPaul Mullowney #undef __FUNCT__ 4489ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering" 4499ae82921SPaul Mullowney PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 4509ae82921SPaul Mullowney { 4519ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4529ae82921SPaul Mullowney PetscErrorCode ierr; 4539ae82921SPaul Mullowney CUSPARRAY *xGPU, *bGPU; 4549ae82921SPaul Mullowney cusparseStatus_t stat; 4559ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = 
(Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 4569ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 4579ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 4589ae82921SPaul Mullowney CUSPARRAY * tempGPU = (CUSPARRAY*) cusparseTriFactors->tempvec; 4599ae82921SPaul Mullowney 4609ae82921SPaul Mullowney PetscFunctionBegin; 4619ae82921SPaul Mullowney // Get the GPU pointers 4629ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr); 4639ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr); 4649ae82921SPaul Mullowney 4659ae82921SPaul Mullowney // solve 4669ae82921SPaul Mullowney stat = cusparseMatLo->solve(bGPU, tempGPU);CHKERRCUSP(stat); 4679ae82921SPaul Mullowney stat = cusparseMatUp->solve(tempGPU, xGPU);CHKERRCUSP(stat); 4689ae82921SPaul Mullowney 4699ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr); 4709ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr); 4719ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 4729ae82921SPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 4739ae82921SPaul Mullowney PetscFunctionReturn(0); 4749ae82921SPaul Mullowney } 4759ae82921SPaul Mullowney 4769ae82921SPaul Mullowney #undef __FUNCT__ 4779ae82921SPaul Mullowney #define __FUNCT__ "MatCUSPARSECopyToGPU" 4789ae82921SPaul Mullowney PetscErrorCode MatCUSPARSECopyToGPU(Mat A) 4799ae82921SPaul Mullowney { 4809ae82921SPaul Mullowney 4819ae82921SPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 4829ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4839ae82921SPaul Mullowney PetscInt m = A->rmap->n,*ii,*ridx; 4849ae82921SPaul Mullowney PetscErrorCode ierr; 4859ae82921SPaul Mullowney 4869ae82921SPaul Mullowney 4879ae82921SPaul Mullowney PetscFunctionBegin; 4889ae82921SPaul Mullowney if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){ 4899ae82921SPaul Mullowney ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 4909ae82921SPaul Mullowney /* 4919ae82921SPaul Mullowney It may be possible to reuse nonzero structure with new matrix values but 4929ae82921SPaul Mullowney for simplicity and insured correctness we delete and build a new matrix on 4939ae82921SPaul Mullowney the GPU. Likely a very small performance hit. 4949ae82921SPaul Mullowney */ 4959ae82921SPaul Mullowney if (cusparseMat->mat){ 4969ae82921SPaul Mullowney try { 4979ae82921SPaul Mullowney delete cusparseMat->mat; 4989ae82921SPaul Mullowney if (cusparseMat->tempvec) 4999ae82921SPaul Mullowney delete cusparseMat->tempvec; 5009ae82921SPaul Mullowney 5019ae82921SPaul Mullowney } catch(char* ex) { 5029ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 5039ae82921SPaul Mullowney } 5049ae82921SPaul Mullowney } 5059ae82921SPaul Mullowney try { 5069ae82921SPaul Mullowney cusparseMat->nonzerorow=0; 5079ae82921SPaul Mullowney for (int j = 0; j<m; j++) 5089ae82921SPaul Mullowney cusparseMat->nonzerorow += ((a->i[j+1]-a->i[j])>0); 5099ae82921SPaul Mullowney 5109ae82921SPaul Mullowney if (a->compressedrow.use) { 5119ae82921SPaul Mullowney m = a->compressedrow.nrows; 5129ae82921SPaul Mullowney ii = a->compressedrow.i; 5139ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 5149ae82921SPaul Mullowney } else { 5159ae82921SPaul Mullowney // Forcing compressed row on the GPU ... 
only relevant for CSR storage 5169ae82921SPaul Mullowney int k=0; 5179ae82921SPaul Mullowney ierr = PetscMalloc((cusparseMat->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr); 5189ae82921SPaul Mullowney ierr = PetscMalloc((cusparseMat->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr); 5199ae82921SPaul Mullowney ii[0]=0; 5209ae82921SPaul Mullowney for (int j = 0; j<m; j++) { 5219ae82921SPaul Mullowney if ((a->i[j+1]-a->i[j])>0) { 5229ae82921SPaul Mullowney ii[k] = a->i[j]; 5239ae82921SPaul Mullowney ridx[k]= j; 5249ae82921SPaul Mullowney k++; 5259ae82921SPaul Mullowney } 5269ae82921SPaul Mullowney } 5279ae82921SPaul Mullowney ii[cusparseMat->nonzerorow] = a->nz; 5289ae82921SPaul Mullowney m = cusparseMat->nonzerorow; 5299ae82921SPaul Mullowney } 5309ae82921SPaul Mullowney 5319ae82921SPaul Mullowney // Build our matrix ... first determine the GPU storage type 5329ae82921SPaul Mullowney cusparseMat->mat = GPU_Matrix_Factory::getNew(cusparseMat->format); 5339ae82921SPaul Mullowney 5349ae82921SPaul Mullowney // Create the streams and events (if desired). 5359ae82921SPaul Mullowney PetscMPIInt size; 5369ae82921SPaul Mullowney ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr); 5379ae82921SPaul Mullowney ierr = cusparseMat->mat->buildStreamsAndEvents(size, &theBodyStream);CHKERRCUSP(ierr); 5389ae82921SPaul Mullowney 539ca45077fSPaul Mullowney // FILL MODE UPPER is irrelevant 540ca45077fSPaul Mullowney cusparseStatus_t stat = cusparseMat->mat->initializeCusparse(MAT_cusparseHandle, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat); 541ca45077fSPaul Mullowney 5429ae82921SPaul Mullowney // lastly, build the matrix 5439ae82921SPaul Mullowney ierr = cusparseMat->mat->setMatrix(m, A->cmap->n, a->nz, ii, a->j, a->a);CHKERRCUSP(ierr); 5449ae82921SPaul Mullowney cusparseMat->mat->setCPRowIndices(ridx, m); 5459ae82921SPaul Mullowney // } 5469ae82921SPaul Mullowney if (!a->compressedrow.use) { 5479ae82921SPaul Mullowney // free data 5489ae82921SPaul Mullowney ierr = PetscFree(ii);CHKERRQ(ierr); 5499ae82921SPaul Mullowney ierr = PetscFree(ridx);CHKERRQ(ierr); 5509ae82921SPaul Mullowney } 5519ae82921SPaul Mullowney cusparseMat->tempvec = new CUSPARRAY; 5529ae82921SPaul Mullowney cusparseMat->tempvec->resize(m); 553ca45077fSPaul Mullowney //A->spptr = cusparseMat; 5549ae82921SPaul Mullowney } catch(char* ex) { 5559ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 5569ae82921SPaul Mullowney } 5579ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 5589ae82921SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_BOTH; 5599ae82921SPaul Mullowney ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 5609ae82921SPaul Mullowney } 5619ae82921SPaul Mullowney PetscFunctionReturn(0); 5629ae82921SPaul Mullowney } 5639ae82921SPaul Mullowney 5649ae82921SPaul Mullowney // #undef __FUNCT__ 5659ae82921SPaul Mullowney // #define __FUNCT__ "MatCUSPARSECopyFromGPU" 5669ae82921SPaul Mullowney // PetscErrorCode MatCUSPARSECopyFromGPU(Mat A, CUSPMATRIX *Agpu) 5679ae82921SPaul Mullowney // { 5689ae82921SPaul Mullowney // Mat_SeqAIJCUSPARSE *cuspstruct = (Mat_SeqAIJCUSPARSE *) A->spptr; 5699ae82921SPaul Mullowney // Mat_SeqAIJ *a = (Mat_SeqAIJ *) A->data; 5709ae82921SPaul Mullowney // PetscInt m = A->rmap->n; 5719ae82921SPaul Mullowney // PetscErrorCode ierr; 5729ae82921SPaul Mullowney 5739ae82921SPaul Mullowney // PetscFunctionBegin; 5749ae82921SPaul Mullowney // if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) { 5759ae82921SPaul 
Mullowney // if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) { 5769ae82921SPaul Mullowney // try { 5779ae82921SPaul Mullowney // cuspstruct->mat = Agpu; 5789ae82921SPaul Mullowney // if (a->compressedrow.use) { 5799ae82921SPaul Mullowney // //PetscInt *ii, *ridx; 5809ae82921SPaul Mullowney // SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Cannot handle row compression for GPU matrices"); 5819ae82921SPaul Mullowney // } else { 5829ae82921SPaul Mullowney // PetscInt i; 5839ae82921SPaul Mullowney 5849ae82921SPaul Mullowney // if (m+1 != (PetscInt) cuspstruct->mat->row_offsets.size()) {SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", cuspstruct->mat->row_offsets.size()-1, m);} 5859ae82921SPaul Mullowney // a->nz = cuspstruct->mat->values.size(); 5869ae82921SPaul Mullowney // a->maxnz = a->nz; /* Since we allocate exactly the right amount */ 5879ae82921SPaul Mullowney // A->preallocated = PETSC_TRUE; 5889ae82921SPaul Mullowney // // Copy ai, aj, aa 5899ae82921SPaul Mullowney // if (a->singlemalloc) { 5909ae82921SPaul Mullowney // if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);} 5919ae82921SPaul Mullowney // } else { 5929ae82921SPaul Mullowney // if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);} 5939ae82921SPaul Mullowney // if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);} 5949ae82921SPaul Mullowney // if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);} 5959ae82921SPaul Mullowney // } 5969ae82921SPaul Mullowney // ierr = PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);CHKERRQ(ierr); 5979ae82921SPaul Mullowney // ierr = PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 5989ae82921SPaul Mullowney // a->singlemalloc = PETSC_TRUE; 5999ae82921SPaul Mullowney // thrust::copy(cuspstruct->mat->row_offsets.begin(), cuspstruct->mat->row_offsets.end(), a->i); 6009ae82921SPaul Mullowney // thrust::copy(cuspstruct->mat->column_indices.begin(), cuspstruct->mat->column_indices.end(), a->j); 6019ae82921SPaul Mullowney // thrust::copy(cuspstruct->mat->values.begin(), cuspstruct->mat->values.end(), a->a); 6029ae82921SPaul Mullowney // // Setup row lengths 6039ae82921SPaul Mullowney // if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);} 6049ae82921SPaul Mullowney // ierr = PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);CHKERRQ(ierr); 6059ae82921SPaul Mullowney // ierr = PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));CHKERRQ(ierr); 6069ae82921SPaul Mullowney // for(i = 0; i < m; ++i) { 6079ae82921SPaul Mullowney // a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i]; 6089ae82921SPaul Mullowney // } 6099ae82921SPaul Mullowney // // a->diag? 
6109ae82921SPaul Mullowney // } 6119ae82921SPaul Mullowney // cuspstruct->tempvec = new CUSPARRAY; 6129ae82921SPaul Mullowney // cuspstruct->tempvec->resize(m); 6139ae82921SPaul Mullowney // } catch(char *ex) { 6149ae82921SPaul Mullowney // SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSP error: %s", ex); 6159ae82921SPaul Mullowney // } 6169ae82921SPaul Mullowney // } 6179ae82921SPaul Mullowney // // This assembly prevents resetting the flag to PETSC_CUSP_CPU and recopying 6189ae82921SPaul Mullowney // ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 6199ae82921SPaul Mullowney // ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 6209ae82921SPaul Mullowney // A->valid_GPU_matrix = PETSC_CUSP_BOTH; 6219ae82921SPaul Mullowney // } else { 6229ae82921SPaul Mullowney // SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Only valid for unallocated GPU matrices"); 6239ae82921SPaul Mullowney // } 6249ae82921SPaul Mullowney // PetscFunctionReturn(0); 6259ae82921SPaul Mullowney // } 6269ae82921SPaul Mullowney 6279ae82921SPaul Mullowney #undef __FUNCT__ 6289ae82921SPaul Mullowney #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE" 6299ae82921SPaul Mullowney PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left) 6309ae82921SPaul Mullowney { 6319ae82921SPaul Mullowney PetscErrorCode ierr; 6329ae82921SPaul Mullowney 6339ae82921SPaul Mullowney PetscFunctionBegin; 6349ae82921SPaul Mullowney 6359ae82921SPaul Mullowney if (right) { 6369ae82921SPaul Mullowney ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr); 6379ae82921SPaul Mullowney ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 6389ae82921SPaul Mullowney ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr); 6399ae82921SPaul Mullowney ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr); 6409ae82921SPaul Mullowney ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr); 6419ae82921SPaul Mullowney } 6429ae82921SPaul Mullowney if (left) { 6439ae82921SPaul Mullowney ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr); 6449ae82921SPaul Mullowney ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr); 6459ae82921SPaul Mullowney ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr); 6469ae82921SPaul Mullowney ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr); 6479ae82921SPaul Mullowney ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr); 6489ae82921SPaul Mullowney } 6499ae82921SPaul Mullowney PetscFunctionReturn(0); 6509ae82921SPaul Mullowney } 6519ae82921SPaul Mullowney 6529ae82921SPaul Mullowney #undef __FUNCT__ 6539ae82921SPaul Mullowney #define __FUNCT__ "MatMult_SeqAIJCUSPARSE" 6549ae82921SPaul Mullowney PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 6559ae82921SPaul Mullowney { 6569ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 6579ae82921SPaul Mullowney PetscErrorCode ierr; 6589ae82921SPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 6599ae82921SPaul Mullowney CUSPARRAY *xarray,*yarray; 6609ae82921SPaul Mullowney 6619ae82921SPaul Mullowney PetscFunctionBegin; 6629ae82921SPaul Mullowney // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 6639ae82921SPaul Mullowney // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 6649ae82921SPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 6659ae82921SPaul Mullowney ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 6669ae82921SPaul Mullowney try { 6679ae82921SPaul Mullowney 
ierr = cusparseMat->mat->multiply(xarray, yarray);CHKERRCUSP(ierr); 6689ae82921SPaul Mullowney } catch (char* ex) { 6699ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 6709ae82921SPaul Mullowney } 6719ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 6729ae82921SPaul Mullowney ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 673ca45077fSPaul Mullowney if (!cusparseMat->mat->hasNonZeroStream()) { 6749ae82921SPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 675ca45077fSPaul Mullowney } 6769ae82921SPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); 6779ae82921SPaul Mullowney PetscFunctionReturn(0); 6789ae82921SPaul Mullowney } 6799ae82921SPaul Mullowney 6809ae82921SPaul Mullowney 6819ae82921SPaul Mullowney #undef __FUNCT__ 682ca45077fSPaul Mullowney #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE" 683ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 684ca45077fSPaul Mullowney { 685ca45077fSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 686ca45077fSPaul Mullowney PetscErrorCode ierr; 687ca45077fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 688ca45077fSPaul Mullowney CUSPARRAY *xarray,*yarray; 689ca45077fSPaul Mullowney 690ca45077fSPaul Mullowney PetscFunctionBegin; 691ca45077fSPaul Mullowney // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 692ca45077fSPaul Mullowney // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr); 693ca45077fSPaul Mullowney ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr); 694ca45077fSPaul Mullowney ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr); 695ca45077fSPaul Mullowney try { 696ca45077fSPaul Mullowney #if !defined(PETSC_USE_COMPLEX) 697ca45077fSPaul Mullowney ierr = cusparseMat->mat->multiply(xarray, yarray, TRANSPOSE);CHKERRCUSP(ierr); 698ca45077fSPaul Mullowney #else 699ca45077fSPaul Mullowney ierr = cusparseMat->mat->multiply(xarray, yarray, HERMITIAN);CHKERRCUSP(ierr); 700ca45077fSPaul Mullowney #endif 701ca45077fSPaul Mullowney } catch (char* ex) { 702ca45077fSPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 703ca45077fSPaul Mullowney } 704ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr); 705ca45077fSPaul Mullowney ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr); 706ca45077fSPaul Mullowney if (!cusparseMat->mat->hasNonZeroStream()) { 707ca45077fSPaul Mullowney ierr = WaitForGPU();CHKERRCUSP(ierr); 708ca45077fSPaul Mullowney } 709ca45077fSPaul Mullowney ierr = PetscLogFlops(2.0*a->nz - cusparseMat->nonzerorow);CHKERRQ(ierr); 710ca45077fSPaul Mullowney PetscFunctionReturn(0); 711ca45077fSPaul Mullowney } 712ca45077fSPaul Mullowney 713ca45077fSPaul Mullowney #undef __FUNCT__ 7149ae82921SPaul Mullowney #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE" 7159ae82921SPaul Mullowney PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 7169ae82921SPaul Mullowney { 7179ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 7189ae82921SPaul Mullowney PetscErrorCode ierr; 7199ae82921SPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr; 7209ae82921SPaul Mullowney CUSPARRAY *xarray,*yarray,*zarray; 7219ae82921SPaul Mullowney PetscFunctionBegin; 7229ae82921SPaul Mullowney // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE 7239ae82921SPaul Mullowney // ierr = 
MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    // multiply add
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray);CHKERRCUSP(ierr);

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CUSPARRAY          *xarray,*yarray,*zarray;
  PetscFunctionBegin;
  // The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
  // ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    // multiply add with matrix transpose; accumulate into zz, which already holds a copy of yy
#if !defined(PETSC_USE_COMPLEX)
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, TRANSPOSE);CHKERRCUSP(ierr);
#else
    ierr = cusparseMat->mat->multiplyAdd(xarray, zarray, HERMITIAN);CHKERRCUSP(ierr);
#endif

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
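/*
   Illustrative sketch (not compiled into the library) of how the GPU SpMV paths
   above are typically exercised from application code.  MatAssemblyEnd dispatches
   to MatAssemblyEnd_SeqAIJCUSPARSE, which copies the matrix to the GPU, so the
   subsequent MatMult/MatMultTranspose calls run through the CUSPARSE kernels.
   The tridiagonal test matrix and sizes below are only an example.

     #include <petscmat.h>

     int main(int argc,char **argv)
     {
       Mat            A;
       Vec            x,y;
       PetscInt       i,n = 10;
       PetscErrorCode ierr;

       ierr = PetscInitialize(&argc,&argv,(char*)0,(char*)0);CHKERRQ(ierr);
       ierr = MatCreate(PETSC_COMM_SELF,&A);CHKERRQ(ierr);
       ierr = MatSetSizes(A,n,n,n,n);CHKERRQ(ierr);
       ierr = MatSetType(A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
       ierr = MatSeqAIJSetPreallocation(A,3,PETSC_NULL);CHKERRQ(ierr);
       for (i=0; i<n; i++) {
         ierr = MatSetValue(A,i,i,2.0,INSERT_VALUES);CHKERRQ(ierr);
         if (i>0)   {ierr = MatSetValue(A,i,i-1,-1.0,INSERT_VALUES);CHKERRQ(ierr);}
         if (i<n-1) {ierr = MatSetValue(A,i,i+1,-1.0,INSERT_VALUES);CHKERRQ(ierr);}
       }
       ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
       ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);   // copies the matrix to the GPU

       ierr = MatGetVecs(A,&x,&y);CHKERRQ(ierr);                    // VECSEQCUSP vectors
       ierr = VecSet(x,1.0);CHKERRQ(ierr);
       ierr = MatMult(A,x,y);CHKERRQ(ierr);                         // MatMult_SeqAIJCUSPARSE
       ierr = MatMultTranspose(A,x,y);CHKERRQ(ierr);                // MatMultTranspose_SeqAIJCUSPARSE

       ierr = VecDestroy(&x);CHKERRQ(ierr);
       ierr = VecDestroy(&y);CHKERRQ(ierr);
       ierr = MatDestroy(&A);CHKERRQ(ierr);
       ierr = PetscFinalize();CHKERRQ(ierr);
       return 0;
     }

   The storage format used for the product can be selected at runtime with
   -mat_mult_cusparse_storage_format CSR|ELL|HYB (see MatSetFromOptions_SeqAIJCUSPARSE above).
*/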

#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  ierr = MatCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  // this is not necessary since MatCUSPARSECopyToGPU has been called.
  //if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
  //  A->valid_GPU_matrix = PETSC_CUSP_CPU;
  //}
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@C
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). For good matrix assembly performance
   the user should preallocate the matrix storage by setting the parameter nz
   (or the array nnz). By setting these parameters accurately, performance
   during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or PETSC_NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.
That is, the stored row and column indices can begin at 8349ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 8359ae82921SPaul Mullowney 8369ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 8379ae82921SPaul Mullowney Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory 8389ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 8399ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 8409ae82921SPaul Mullowney 8419ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 8429ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 8439ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 8449ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 8459ae82921SPaul Mullowney 8469ae82921SPaul Mullowney Level: intermediate 8479ae82921SPaul Mullowney 8489ae82921SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 8499ae82921SPaul Mullowney 8509ae82921SPaul Mullowney @*/ 8519ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 8529ae82921SPaul Mullowney { 8539ae82921SPaul Mullowney PetscErrorCode ierr; 8549ae82921SPaul Mullowney 8559ae82921SPaul Mullowney PetscFunctionBegin; 8569ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 8579ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 8589ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 8599ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 8609ae82921SPaul Mullowney PetscFunctionReturn(0); 8619ae82921SPaul Mullowney } 8629ae82921SPaul Mullowney 8639ae82921SPaul Mullowney #undef __FUNCT__ 8649ae82921SPaul Mullowney #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE" 8659ae82921SPaul Mullowney PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 8669ae82921SPaul Mullowney { 8679ae82921SPaul Mullowney PetscErrorCode ierr; 8689ae82921SPaul Mullowney Mat_SeqAIJCUSPARSE *cusparseMat = (Mat_SeqAIJCUSPARSE*)A->spptr; 8699ae82921SPaul Mullowney 8709ae82921SPaul Mullowney PetscFunctionBegin; 8719ae82921SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 8729ae82921SPaul Mullowney // The regular matrices 8739ae82921SPaul Mullowney try { 8749ae82921SPaul Mullowney if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){ 8759ae82921SPaul Mullowney delete (GPU_Matrix_Ifc *)(cusparseMat->mat); 8769ae82921SPaul Mullowney } 8779ae82921SPaul Mullowney if (cusparseMat->tempvec!=0) 8789ae82921SPaul Mullowney delete cusparseMat->tempvec; 8799ae82921SPaul Mullowney delete cusparseMat; 8809ae82921SPaul Mullowney A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 8819ae82921SPaul Mullowney } catch(char* ex) { 8829ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 8839ae82921SPaul Mullowney } 8849ae82921SPaul Mullowney } else { 8859ae82921SPaul Mullowney // The triangular factors 8869ae82921SPaul Mullowney try { 8879ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 8889ae82921SPaul Mullowney GPU_Matrix_Ifc *cusparseMatLo = (GPU_Matrix_Ifc*)cusparseTriFactors->loTriFactorPtr; 8899ae82921SPaul 
Mullowney GPU_Matrix_Ifc *cusparseMatUp = (GPU_Matrix_Ifc*)cusparseTriFactors->upTriFactorPtr; 8909ae82921SPaul Mullowney // destroy the matrix and the container 8919ae82921SPaul Mullowney delete (GPU_Matrix_Ifc *)cusparseMatLo; 8929ae82921SPaul Mullowney delete (GPU_Matrix_Ifc *)cusparseMatUp; 8939ae82921SPaul Mullowney delete (CUSPARRAY*) cusparseTriFactors->tempvec; 8949ae82921SPaul Mullowney delete cusparseTriFactors; 8959ae82921SPaul Mullowney } catch(char* ex) { 8969ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 8979ae82921SPaul Mullowney } 8989ae82921SPaul Mullowney } 8999ae82921SPaul Mullowney if (MAT_cusparseHandle) { 9009ae82921SPaul Mullowney cusparseStatus_t stat; 9019ae82921SPaul Mullowney stat = cusparseDestroy(MAT_cusparseHandle);CHKERRCUSP(stat); 9029ae82921SPaul Mullowney MAT_cusparseHandle=0; 9039ae82921SPaul Mullowney } 9049ae82921SPaul Mullowney /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */ 9059ae82921SPaul Mullowney A->spptr = 0; 9069ae82921SPaul Mullowney 9079ae82921SPaul Mullowney ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 9089ae82921SPaul Mullowney PetscFunctionReturn(0); 9099ae82921SPaul Mullowney } 9109ae82921SPaul Mullowney 9119ae82921SPaul Mullowney 9129ae82921SPaul Mullowney EXTERN_C_BEGIN 9139ae82921SPaul Mullowney #undef __FUNCT__ 9149ae82921SPaul Mullowney #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE" 9159ae82921SPaul Mullowney PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 9169ae82921SPaul Mullowney { 9179ae82921SPaul Mullowney PetscErrorCode ierr; 918ca45077fSPaul Mullowney GPUStorageFormat format = CSR; 9199ae82921SPaul Mullowney 9209ae82921SPaul Mullowney PetscFunctionBegin; 9219ae82921SPaul Mullowney ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 9229ae82921SPaul Mullowney if (B->factortype==MAT_FACTOR_NONE) { 9239ae82921SPaul Mullowney /* you cannot check the inode.use flag here since the matrix was just created.*/ 924ca45077fSPaul Mullowney // now build a GPU matrix data structure 9259ae82921SPaul Mullowney B->spptr = new Mat_SeqAIJCUSPARSE; 9269ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSE *)B->spptr)->mat = 0; 9279ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSE *)B->spptr)->tempvec = 0; 928ca45077fSPaul Mullowney ((Mat_SeqAIJCUSPARSE *)B->spptr)->format = format; 9299ae82921SPaul Mullowney } else { 9309ae82921SPaul Mullowney /* NEXT, set the pointers to the triangular factors */ 931debe9ee2SPaul Mullowney B->spptr = new Mat_SeqAIJCUSPARSETriFactors; 9329ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->loTriFactorPtr = 0; 9339ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->upTriFactorPtr = 0; 9349ae82921SPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->tempvec = 0; 935ca45077fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)B->spptr)->format = format; 9369ae82921SPaul Mullowney } 9379ae82921SPaul Mullowney // Create a single instance of the MAT_cusparseHandle for any matrix (matMult, TriSolve, ...) 9389ae82921SPaul Mullowney if (!MAT_cusparseHandle) { 9399ae82921SPaul Mullowney cusparseStatus_t stat; 9409ae82921SPaul Mullowney stat = cusparseCreate(&MAT_cusparseHandle);CHKERRCUSP(stat); 9419ae82921SPaul Mullowney } 9429ae82921SPaul Mullowney // Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the 9439ae82921SPaul Mullowney // default cusparse tri solve. 
Note the difference with the implementation in 9449ae82921SPaul Mullowney // MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu 9459ae82921SPaul Mullowney ierr = PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_cusparse",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr); 9469ae82921SPaul Mullowney B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 9479ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 9489ae82921SPaul Mullowney B->ops->getvecs = MatGetVecs_SeqAIJCUSPARSE; 9499ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 950ca45077fSPaul Mullowney B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 951ca45077fSPaul Mullowney B->ops->mult = MatMult_SeqAIJCUSPARSE; 952ca45077fSPaul Mullowney B->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 953ca45077fSPaul Mullowney B->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 954ca45077fSPaul Mullowney B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 9559ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 9569ae82921SPaul Mullowney B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED; 9579ae82921SPaul Mullowney PetscFunctionReturn(0); 9589ae82921SPaul Mullowney } 9599ae82921SPaul Mullowney EXTERN_C_END 9609ae82921SPaul Mullowney 961
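/*
   Usage sketch for the CUSPARSE triangular solves registered above (illustrative
   only; the example name ex1 is a placeholder).  With a PETSc build configured
   with --download-txpetscgpu, an existing KSP example can be switched to this
   implementation entirely from the command line:

     ./ex1 -mat_type aijcusparse -vec_type cusp \
           -pc_type ilu -pc_factor_mat_solver_package cusparse \
           -mat_solve_cusparse_storage_format CSR

   The factorization can also be driven directly; a minimal sketch, assuming A is
   an assembled MATSEQAIJCUSPARSE matrix and b,x are compatible VECSEQCUSP vectors:

     Mat            F;
     IS             isrow,iscol;
     MatFactorInfo  info;
     PetscErrorCode ierr;

     ierr = MatFactorInfoInitialize(&info);CHKERRQ(ierr);
     info.levels = 0;                                       // example ILU(0) settings
     info.fill   = 1.0;
     ierr = MatGetOrdering(A,MATORDERINGNATURAL,&isrow,&iscol);CHKERRQ(ierr);
     ierr = MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_ILU,&F);CHKERRQ(ierr);
     ierr = MatILUFactorSymbolic(F,A,isrow,iscol,&info);CHKERRQ(ierr);
     ierr = MatLUFactorNumeric(F,A,&info);CHKERRQ(ierr);    // builds the GPU triangular factors
     ierr = MatSolve(F,b,x);CHKERRQ(ierr);                  // MatSolve_SeqAIJCUSPARSE(_NaturalOrdering)
     ierr = ISDestroy(&isrow);CHKERRQ(ierr);
     ierr = ISDestroy(&iscol);CHKERRQ(ierr);
     ierr = MatDestroy(&F);CHKERRQ(ierr);

   The -mat_solve_cusparse_storage_format option above corresponds to
   MatAIJCUSPARSESetGPUStorageFormatForMatSolve() in this file.
*/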