xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision aa372e3fe8df1a25e86bd398820fb3017c3d0016)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3bc3f50f2SPaul Mullowney   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
59ae82921SPaul Mullowney 
69ae82921SPaul Mullowney #include "petscconf.h"
79ae82921SPaul Mullowney #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
8087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
99ae82921SPaul Mullowney #include "../src/vec/vec/impls/dvecimpl.h"
109ae82921SPaul Mullowney #include "petsc-private/vecimpl.h"
119ae82921SPaul Mullowney #undef VecType
129ae82921SPaul Mullowney #include "cusparsematimpl.h"
13bc3f50f2SPaul Mullowney 
14e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
159ae82921SPaul Mullowney 
16087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
17087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
19087f3262SPaul Mullowney 
206fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
216fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
226fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
23087f3262SPaul Mullowney 
246fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
256fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
266fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
276fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
286fa9248bSJed Brown static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
296fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
306fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
316fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
326fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
339ae82921SPaul Mullowney 
349ae82921SPaul Mullowney #undef __FUNCT__
359ae82921SPaul Mullowney #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
369ae82921SPaul Mullowney PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
379ae82921SPaul Mullowney {
389ae82921SPaul Mullowney   PetscFunctionBegin;
399ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
409ae82921SPaul Mullowney   PetscFunctionReturn(0);
419ae82921SPaul Mullowney }
429ae82921SPaul Mullowney 
43c708e6cdSJed Brown /*MC
44087f3262SPaul Mullowney   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
45087f3262SPaul Mullowney   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
46087f3262SPaul Mullowney   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
47087f3262SPaul Mullowney   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
48087f3262SPaul Mullowney   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
49087f3262SPaul Mullowney   algorithms are not recommended. This class does NOT support direct solver operations.
50c708e6cdSJed Brown 
51087f3262SPaul Mullowney   Consult CUSPARSE documentation for more information about the matrix storage formats
52087f3262SPaul Mullowney   which correspond to the options database keys below.
53c708e6cdSJed Brown 
54c708e6cdSJed Brown    Options Database Keys:
55*aa372e3fSPaul Mullowney .  -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
569ae82921SPaul Mullowney 
579ae82921SPaul Mullowney   Level: beginner
58c708e6cdSJed Brown 
59c708e6cdSJed Brown .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
60c708e6cdSJed Brown M*/
619ae82921SPaul Mullowney 
629ae82921SPaul Mullowney #undef __FUNCT__
639ae82921SPaul Mullowney #define __FUNCT__ "MatGetFactor_seqaij_cusparse"
643c3a0fd4SSatish Balay PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
659ae82921SPaul Mullowney {
669ae82921SPaul Mullowney   PetscErrorCode ierr;
67bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
689ae82921SPaul Mullowney 
699ae82921SPaul Mullowney   PetscFunctionBegin;
70bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
71404133a2SPaul Mullowney   (*B)->factortype = ftype;
72bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
739ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
742205254eSKarl Rupp 
75087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
76bc3f50f2SPaul Mullowney     ierr = MatSetBlockSizes(*B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr);
779ae82921SPaul Mullowney     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
789ae82921SPaul Mullowney     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
79087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
80087f3262SPaul Mullowney     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
81087f3262SPaul Mullowney     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
829ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
83bc3f50f2SPaul Mullowney 
84fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
8562a20339SJed Brown   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
869ae82921SPaul Mullowney   PetscFunctionReturn(0);
879ae82921SPaul Mullowney }
889ae82921SPaul Mullowney 
899ae82921SPaul Mullowney #undef __FUNCT__
90e057df02SPaul Mullowney #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
91bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
92ca45077fSPaul Mullowney {
93*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
946e111a19SKarl Rupp 
95ca45077fSPaul Mullowney   PetscFunctionBegin;
96ca45077fSPaul Mullowney   switch (op) {
97e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
98*aa372e3fSPaul Mullowney     cusparsestruct->format = format;
99ca45077fSPaul Mullowney     break;
100e057df02SPaul Mullowney   case MAT_CUSPARSE_SOLVE:
101e057df02SPaul Mullowney     cusparseMatSolveStorageFormat = format;
102ca45077fSPaul Mullowney     break;
103e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
104*aa372e3fSPaul Mullowney     cusparsestruct->format = format;
105e057df02SPaul Mullowney     cusparseMatSolveStorageFormat = format;
106ca45077fSPaul Mullowney     break;
107ca45077fSPaul Mullowney   default:
108e057df02SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
109ca45077fSPaul Mullowney   }
110ca45077fSPaul Mullowney   PetscFunctionReturn(0);
111ca45077fSPaul Mullowney }
1129ae82921SPaul Mullowney 
113e057df02SPaul Mullowney /*@
114e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
115e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
116*aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
117e057df02SPaul Mullowney    Not Collective
118e057df02SPaul Mullowney 
119e057df02SPaul Mullowney    Input Parameters:
1208468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
1218468deeeSKarl Rupp .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
1228468deeeSKarl Rupp -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)
123e057df02SPaul Mullowney 
124e057df02SPaul Mullowney    Output Parameter:
125e057df02SPaul Mullowney 
126e057df02SPaul Mullowney    Level: intermediate
127e057df02SPaul Mullowney 
1288468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
129e057df02SPaul Mullowney @*/
130e057df02SPaul Mullowney #undef __FUNCT__
131e057df02SPaul Mullowney #define __FUNCT__ "MatCUSPARSESetFormat"
132e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
133e057df02SPaul Mullowney {
134e057df02SPaul Mullowney   PetscErrorCode ierr;
1356e111a19SKarl Rupp 
136e057df02SPaul Mullowney   PetscFunctionBegin;
137e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
138e057df02SPaul Mullowney   ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
139e057df02SPaul Mullowney   PetscFunctionReturn(0);
140e057df02SPaul Mullowney }
141e057df02SPaul Mullowney 
1429ae82921SPaul Mullowney #undef __FUNCT__
1439ae82921SPaul Mullowney #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
1446fa9248bSJed Brown static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
1459ae82921SPaul Mullowney {
1469ae82921SPaul Mullowney   PetscErrorCode           ierr;
147e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
1489ae82921SPaul Mullowney   PetscBool                flg;
1496e111a19SKarl Rupp 
1509ae82921SPaul Mullowney   PetscFunctionBegin;
151e057df02SPaul Mullowney   ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
152e057df02SPaul Mullowney   ierr = PetscObjectOptionsBegin((PetscObject)A);
1539ae82921SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
154e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
1557083f85cSSatish Balay                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
156e057df02SPaul Mullowney     if (flg) {
157e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
158045c96e1SPaul Mullowney     }
1592205254eSKarl Rupp   } else {
160e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
1617083f85cSSatish Balay                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
162e057df02SPaul Mullowney     if (flg) {
163e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
164e057df02SPaul Mullowney     }
1659ae82921SPaul Mullowney   }
1664c87dfd4SPaul Mullowney   ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
1677083f85cSSatish Balay                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
1684c87dfd4SPaul Mullowney   if (flg) {
1694c87dfd4SPaul Mullowney     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
1704c87dfd4SPaul Mullowney   }
1719ae82921SPaul Mullowney   ierr = PetscOptionsEnd();CHKERRQ(ierr);
1729ae82921SPaul Mullowney   PetscFunctionReturn(0);
1739ae82921SPaul Mullowney 
1749ae82921SPaul Mullowney }
1759ae82921SPaul Mullowney 
1769ae82921SPaul Mullowney #undef __FUNCT__
1779ae82921SPaul Mullowney #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
1786fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1799ae82921SPaul Mullowney {
1809ae82921SPaul Mullowney   PetscErrorCode ierr;
1819ae82921SPaul Mullowney 
1829ae82921SPaul Mullowney   PetscFunctionBegin;
1839ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1849ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1859ae82921SPaul Mullowney   PetscFunctionReturn(0);
1869ae82921SPaul Mullowney }
1879ae82921SPaul Mullowney 
1889ae82921SPaul Mullowney #undef __FUNCT__
1899ae82921SPaul Mullowney #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
1906fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1919ae82921SPaul Mullowney {
1929ae82921SPaul Mullowney   PetscErrorCode ierr;
1939ae82921SPaul Mullowney 
1949ae82921SPaul Mullowney   PetscFunctionBegin;
1959ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
1969ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1979ae82921SPaul Mullowney   PetscFunctionReturn(0);
1989ae82921SPaul Mullowney }
1999ae82921SPaul Mullowney 
2009ae82921SPaul Mullowney #undef __FUNCT__
201087f3262SPaul Mullowney #define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
202087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
203087f3262SPaul Mullowney {
204087f3262SPaul Mullowney   PetscErrorCode ierr;
205087f3262SPaul Mullowney 
206087f3262SPaul Mullowney   PetscFunctionBegin;
207087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
208087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
209087f3262SPaul Mullowney   PetscFunctionReturn(0);
210087f3262SPaul Mullowney }
211087f3262SPaul Mullowney 
212087f3262SPaul Mullowney #undef __FUNCT__
213087f3262SPaul Mullowney #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
214087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
215087f3262SPaul Mullowney {
216087f3262SPaul Mullowney   PetscErrorCode ierr;
217087f3262SPaul Mullowney 
218087f3262SPaul Mullowney   PetscFunctionBegin;
219087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
220087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
221087f3262SPaul Mullowney   PetscFunctionReturn(0);
222087f3262SPaul Mullowney }
223087f3262SPaul Mullowney 
224087f3262SPaul Mullowney #undef __FUNCT__
225087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
226087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
2279ae82921SPaul Mullowney {
2289ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
2299ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
2309ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
231*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
2329ae82921SPaul Mullowney   cusparseStatus_t                  stat;
2339ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
2349ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
2359ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
2369ae82921SPaul Mullowney   PetscScalar                       *AALo;
2379ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
238b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
2399ae82921SPaul Mullowney 
2409ae82921SPaul Mullowney   PetscFunctionBegin;
2419ae82921SPaul Mullowney   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
2429ae82921SPaul Mullowney     try {
2439ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
2449ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
2459ae82921SPaul Mullowney 
2469ae82921SPaul Mullowney       /* Allocate Space for the lower triangular matrix */
2479ae82921SPaul Mullowney       ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
2489ae82921SPaul Mullowney       ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
2499ae82921SPaul Mullowney       ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);
2509ae82921SPaul Mullowney 
2519ae82921SPaul Mullowney       /* Fill the lower triangular matrix */
2529ae82921SPaul Mullowney       AiLo[0]  = (PetscInt) 0;
2539ae82921SPaul Mullowney       AiLo[n]  = nzLower;
2549ae82921SPaul Mullowney       AjLo[0]  = (PetscInt) 0;
2559ae82921SPaul Mullowney       AALo[0]  = (MatScalar) 1.0;
2569ae82921SPaul Mullowney       v        = aa;
2579ae82921SPaul Mullowney       vi       = aj;
2589ae82921SPaul Mullowney       offset   = 1;
2599ae82921SPaul Mullowney       rowOffset= 1;
2609ae82921SPaul Mullowney       for (i=1; i<n; i++) {
2619ae82921SPaul Mullowney         nz = ai[i+1] - ai[i];
262e057df02SPaul Mullowney         /* additional 1 for the term on the diagonal */
2639ae82921SPaul Mullowney         AiLo[i]    = rowOffset;
2649ae82921SPaul Mullowney         rowOffset += nz+1;
2659ae82921SPaul Mullowney 
2669ae82921SPaul Mullowney         ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
2679ae82921SPaul Mullowney         ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
2689ae82921SPaul Mullowney 
2699ae82921SPaul Mullowney         offset      += nz;
2709ae82921SPaul Mullowney         AjLo[offset] = (PetscInt) i;
2719ae82921SPaul Mullowney         AALo[offset] = (MatScalar) 1.0;
2729ae82921SPaul Mullowney         offset      += 1;
2739ae82921SPaul Mullowney 
2749ae82921SPaul Mullowney         v  += nz;
2759ae82921SPaul Mullowney         vi += nz;
2769ae82921SPaul Mullowney       }
2772205254eSKarl Rupp 
278*aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
279*aa372e3fSPaul Mullowney       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
2802205254eSKarl Rupp 
281*aa372e3fSPaul Mullowney       /* Create the matrix description */
282*aa372e3fSPaul Mullowney       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
283*aa372e3fSPaul Mullowney       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
284*aa372e3fSPaul Mullowney       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
285*aa372e3fSPaul Mullowney       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
286*aa372e3fSPaul Mullowney       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
287*aa372e3fSPaul Mullowney 
288*aa372e3fSPaul Mullowney       /* Create the solve analysis information */
289*aa372e3fSPaul Mullowney       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);
290*aa372e3fSPaul Mullowney 
291*aa372e3fSPaul Mullowney       /* set the operation */
292*aa372e3fSPaul Mullowney       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
293*aa372e3fSPaul Mullowney 
294*aa372e3fSPaul Mullowney       /* set the matrix */
295*aa372e3fSPaul Mullowney       loTriFactor->csrMat = new CsrMatrix;
296*aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_rows = n;
297*aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_cols = n;
298*aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_entries = nzLower;
299*aa372e3fSPaul Mullowney 
300*aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
301*aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
302*aa372e3fSPaul Mullowney 
303*aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
304*aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
305*aa372e3fSPaul Mullowney 
306*aa372e3fSPaul Mullowney       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
307*aa372e3fSPaul Mullowney       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
308*aa372e3fSPaul Mullowney 
309*aa372e3fSPaul Mullowney       /* perform the solve analysis */
310*aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
311*aa372e3fSPaul Mullowney                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
312*aa372e3fSPaul Mullowney                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
313*aa372e3fSPaul Mullowney                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);
314*aa372e3fSPaul Mullowney 
315*aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
316*aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
3172205254eSKarl Rupp 
3189ae82921SPaul Mullowney       ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
3199ae82921SPaul Mullowney       ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
3209ae82921SPaul Mullowney       ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
3219ae82921SPaul Mullowney     } catch(char *ex) {
3229ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
3239ae82921SPaul Mullowney     }
3249ae82921SPaul Mullowney   }
3259ae82921SPaul Mullowney   PetscFunctionReturn(0);
3269ae82921SPaul Mullowney }
3279ae82921SPaul Mullowney 
3289ae82921SPaul Mullowney #undef __FUNCT__
329087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
330087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
3319ae82921SPaul Mullowney {
3329ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3339ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3349ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
335*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
3369ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3379ae82921SPaul Mullowney   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
3389ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3399ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
3409ae82921SPaul Mullowney   PetscScalar                       *AAUp;
3419ae82921SPaul Mullowney   PetscInt                          i,nz, nzUpper, offset;
3429ae82921SPaul Mullowney   PetscErrorCode                    ierr;
3439ae82921SPaul Mullowney 
3449ae82921SPaul Mullowney   PetscFunctionBegin;
3459ae82921SPaul Mullowney   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
3469ae82921SPaul Mullowney     try {
3479ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
3489ae82921SPaul Mullowney       nzUpper = adiag[0]-adiag[n];
3499ae82921SPaul Mullowney 
3509ae82921SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
3519ae82921SPaul Mullowney       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
3529ae82921SPaul Mullowney       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
3539ae82921SPaul Mullowney       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
3549ae82921SPaul Mullowney 
3559ae82921SPaul Mullowney       /* Fill the upper triangular matrix */
3569ae82921SPaul Mullowney       AiUp[0]=(PetscInt) 0;
3579ae82921SPaul Mullowney       AiUp[n]=nzUpper;
3589ae82921SPaul Mullowney       offset = nzUpper;
3599ae82921SPaul Mullowney       for (i=n-1; i>=0; i--) {
3609ae82921SPaul Mullowney         v  = aa + adiag[i+1] + 1;
3619ae82921SPaul Mullowney         vi = aj + adiag[i+1] + 1;
3629ae82921SPaul Mullowney 
363e057df02SPaul Mullowney         /* number of elements NOT on the diagonal */
3649ae82921SPaul Mullowney         nz = adiag[i] - adiag[i+1]-1;
3659ae82921SPaul Mullowney 
366e057df02SPaul Mullowney         /* decrement the offset */
3679ae82921SPaul Mullowney         offset -= (nz+1);
3689ae82921SPaul Mullowney 
369e057df02SPaul Mullowney         /* first, set the diagonal elements */
3709ae82921SPaul Mullowney         AjUp[offset] = (PetscInt) i;
3719ae82921SPaul Mullowney         AAUp[offset] = 1./v[nz];
3729ae82921SPaul Mullowney         AiUp[i]      = AiUp[i+1] - (nz+1);
3739ae82921SPaul Mullowney 
3749ae82921SPaul Mullowney         ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
3759ae82921SPaul Mullowney         ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
3769ae82921SPaul Mullowney       }
3772205254eSKarl Rupp 
378*aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
379*aa372e3fSPaul Mullowney       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
3802205254eSKarl Rupp 
381*aa372e3fSPaul Mullowney       /* Create the matrix description */
382*aa372e3fSPaul Mullowney       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
383*aa372e3fSPaul Mullowney       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
384*aa372e3fSPaul Mullowney       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
385*aa372e3fSPaul Mullowney       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
386*aa372e3fSPaul Mullowney       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
387*aa372e3fSPaul Mullowney 
388*aa372e3fSPaul Mullowney       /* Create the solve analysis information */
389*aa372e3fSPaul Mullowney       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);
390*aa372e3fSPaul Mullowney 
391*aa372e3fSPaul Mullowney       /* set the operation */
392*aa372e3fSPaul Mullowney       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
393*aa372e3fSPaul Mullowney 
394*aa372e3fSPaul Mullowney       /* set the matrix */
395*aa372e3fSPaul Mullowney       upTriFactor->csrMat = new CsrMatrix;
396*aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_rows = n;
397*aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_cols = n;
398*aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_entries = nzUpper;
399*aa372e3fSPaul Mullowney 
400*aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
401*aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
402*aa372e3fSPaul Mullowney 
403*aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
404*aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
405*aa372e3fSPaul Mullowney 
406*aa372e3fSPaul Mullowney       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
407*aa372e3fSPaul Mullowney       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
408*aa372e3fSPaul Mullowney 
409*aa372e3fSPaul Mullowney       /* perform the solve analysis */
410*aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
411*aa372e3fSPaul Mullowney                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
412*aa372e3fSPaul Mullowney                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
413*aa372e3fSPaul Mullowney                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);
414*aa372e3fSPaul Mullowney 
415*aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
416*aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
4172205254eSKarl Rupp 
4189ae82921SPaul Mullowney       ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
4199ae82921SPaul Mullowney       ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
4209ae82921SPaul Mullowney       ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
4219ae82921SPaul Mullowney     } catch(char *ex) {
4229ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4239ae82921SPaul Mullowney     }
4249ae82921SPaul Mullowney   }
4259ae82921SPaul Mullowney   PetscFunctionReturn(0);
4269ae82921SPaul Mullowney }
4279ae82921SPaul Mullowney 
4289ae82921SPaul Mullowney #undef __FUNCT__
429087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
430087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
4319ae82921SPaul Mullowney {
4329ae82921SPaul Mullowney   PetscErrorCode               ierr;
4339ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
4349ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
4359ae82921SPaul Mullowney   IS                           isrow = a->row,iscol = a->icol;
4369ae82921SPaul Mullowney   PetscBool                    row_identity,col_identity;
4379ae82921SPaul Mullowney   const PetscInt               *r,*c;
4389ae82921SPaul Mullowney   PetscInt                     n = A->rmap->n;
4399ae82921SPaul Mullowney 
4409ae82921SPaul Mullowney   PetscFunctionBegin;
441087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
442087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
4432205254eSKarl Rupp 
444*aa372e3fSPaul Mullowney   cusparseTriFactors->workVector = new THRUSTARRAY;
445*aa372e3fSPaul Mullowney   cusparseTriFactors->workVector->resize(n);
446*aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=a->nz;
4479ae82921SPaul Mullowney 
4489ae82921SPaul Mullowney   A->valid_GPU_matrix = PETSC_CUSP_BOTH;
449e057df02SPaul Mullowney   /*lower triangular indices */
4509ae82921SPaul Mullowney   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4519ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
4522205254eSKarl Rupp   if (!row_identity) {
453*aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
454*aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r+n);
4552205254eSKarl Rupp   }
4569ae82921SPaul Mullowney   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
4579ae82921SPaul Mullowney 
458e057df02SPaul Mullowney   /*upper triangular indices */
4599ae82921SPaul Mullowney   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
4609ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
4612205254eSKarl Rupp   if (!col_identity) {
462*aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
463*aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c+n);
4642205254eSKarl Rupp   }
4659ae82921SPaul Mullowney   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
4669ae82921SPaul Mullowney   PetscFunctionReturn(0);
4679ae82921SPaul Mullowney }
4689ae82921SPaul Mullowney 
4699ae82921SPaul Mullowney #undef __FUNCT__
470087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
471087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
472087f3262SPaul Mullowney {
473087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
474087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
475*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
476*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
477087f3262SPaul Mullowney   cusparseStatus_t                  stat;
478087f3262SPaul Mullowney   PetscErrorCode                    ierr;
479087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
480087f3262SPaul Mullowney   PetscScalar                       *AAUp;
481087f3262SPaul Mullowney   PetscScalar                       *AALo;
482087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
483087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
484087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
485087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
486087f3262SPaul Mullowney 
487087f3262SPaul Mullowney   PetscFunctionBegin;
488087f3262SPaul Mullowney   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
489087f3262SPaul Mullowney     try {
490087f3262SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
491087f3262SPaul Mullowney       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
492087f3262SPaul Mullowney       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
493087f3262SPaul Mullowney       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
494087f3262SPaul Mullowney       ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
495087f3262SPaul Mullowney 
496087f3262SPaul Mullowney       /* Fill the upper triangular matrix */
497087f3262SPaul Mullowney       AiUp[0]=(PetscInt) 0;
498087f3262SPaul Mullowney       AiUp[n]=nzUpper;
499087f3262SPaul Mullowney       offset = 0;
500087f3262SPaul Mullowney       for (i=0; i<n; i++) {
501087f3262SPaul Mullowney         /* set the pointers */
502087f3262SPaul Mullowney         v  = aa + ai[i];
503087f3262SPaul Mullowney         vj = aj + ai[i];
504087f3262SPaul Mullowney         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
505087f3262SPaul Mullowney 
506087f3262SPaul Mullowney         /* first, set the diagonal elements */
507087f3262SPaul Mullowney         AjUp[offset] = (PetscInt) i;
508087f3262SPaul Mullowney         AAUp[offset] = 1.0/v[nz];
509087f3262SPaul Mullowney         AiUp[i]      = offset;
510087f3262SPaul Mullowney         AALo[offset] = 1.0/v[nz];
511087f3262SPaul Mullowney 
512087f3262SPaul Mullowney         offset+=1;
513087f3262SPaul Mullowney         if (nz>0) {
514087f3262SPaul Mullowney           ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
515087f3262SPaul Mullowney           ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
516087f3262SPaul Mullowney           for (j=offset; j<offset+nz; j++) {
517087f3262SPaul Mullowney             AAUp[j] = -AAUp[j];
518087f3262SPaul Mullowney             AALo[j] = AAUp[j]/v[nz];
519087f3262SPaul Mullowney           }
520087f3262SPaul Mullowney           offset+=nz;
521087f3262SPaul Mullowney         }
522087f3262SPaul Mullowney       }
523087f3262SPaul Mullowney 
524*aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
525*aa372e3fSPaul Mullowney       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
526087f3262SPaul Mullowney 
527*aa372e3fSPaul Mullowney       /* Create the matrix description */
528*aa372e3fSPaul Mullowney       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
529*aa372e3fSPaul Mullowney       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
530*aa372e3fSPaul Mullowney       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
531*aa372e3fSPaul Mullowney       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
532*aa372e3fSPaul Mullowney       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
533087f3262SPaul Mullowney 
534*aa372e3fSPaul Mullowney       /* Create the solve analysis information */
535*aa372e3fSPaul Mullowney       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);
536*aa372e3fSPaul Mullowney 
537*aa372e3fSPaul Mullowney       /* set the operation */
538*aa372e3fSPaul Mullowney       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
539*aa372e3fSPaul Mullowney 
540*aa372e3fSPaul Mullowney       /* set the matrix */
541*aa372e3fSPaul Mullowney       upTriFactor->csrMat = new CsrMatrix;
542*aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_rows = A->rmap->n;
543*aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_cols = A->cmap->n;
544*aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_entries = a->nz;
545*aa372e3fSPaul Mullowney 
546*aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
547*aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
548*aa372e3fSPaul Mullowney 
549*aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
550*aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
551*aa372e3fSPaul Mullowney 
552*aa372e3fSPaul Mullowney       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
553*aa372e3fSPaul Mullowney       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
554*aa372e3fSPaul Mullowney 
555*aa372e3fSPaul Mullowney       /* perform the solve analysis */
556*aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
557*aa372e3fSPaul Mullowney                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
558*aa372e3fSPaul Mullowney                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
559*aa372e3fSPaul Mullowney                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);
560*aa372e3fSPaul Mullowney 
561*aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
562*aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
563*aa372e3fSPaul Mullowney 
564*aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
565*aa372e3fSPaul Mullowney       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
566*aa372e3fSPaul Mullowney 
567*aa372e3fSPaul Mullowney       /* Create the matrix description */
568*aa372e3fSPaul Mullowney       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
569*aa372e3fSPaul Mullowney       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
570*aa372e3fSPaul Mullowney       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
571*aa372e3fSPaul Mullowney       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
572*aa372e3fSPaul Mullowney       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
573*aa372e3fSPaul Mullowney 
574*aa372e3fSPaul Mullowney       /* Create the solve analysis information */
575*aa372e3fSPaul Mullowney       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);
576*aa372e3fSPaul Mullowney 
577*aa372e3fSPaul Mullowney       /* set the operation */
578*aa372e3fSPaul Mullowney       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
579*aa372e3fSPaul Mullowney 
580*aa372e3fSPaul Mullowney       /* set the matrix */
581*aa372e3fSPaul Mullowney       loTriFactor->csrMat = new CsrMatrix;
582*aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_rows = A->rmap->n;
583*aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_cols = A->cmap->n;
584*aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_entries = a->nz;
585*aa372e3fSPaul Mullowney 
586*aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
587*aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
588*aa372e3fSPaul Mullowney 
589*aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
590*aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
591*aa372e3fSPaul Mullowney 
592*aa372e3fSPaul Mullowney       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
593*aa372e3fSPaul Mullowney       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
594*aa372e3fSPaul Mullowney 
595*aa372e3fSPaul Mullowney       /* perform the solve analysis */
596*aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
597*aa372e3fSPaul Mullowney                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
598*aa372e3fSPaul Mullowney                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
599*aa372e3fSPaul Mullowney                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);
600*aa372e3fSPaul Mullowney 
601*aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
602*aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
603087f3262SPaul Mullowney 
604087f3262SPaul Mullowney       A->valid_GPU_matrix = PETSC_CUSP_BOTH;
605087f3262SPaul Mullowney       ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
606087f3262SPaul Mullowney       ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
607087f3262SPaul Mullowney       ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
608087f3262SPaul Mullowney       ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
609087f3262SPaul Mullowney     } catch(char *ex) {
610087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
611087f3262SPaul Mullowney     }
612087f3262SPaul Mullowney   }
613087f3262SPaul Mullowney   PetscFunctionReturn(0);
614087f3262SPaul Mullowney }
615087f3262SPaul Mullowney 
616087f3262SPaul Mullowney #undef __FUNCT__
617087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
618087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
6199ae82921SPaul Mullowney {
6209ae82921SPaul Mullowney   PetscErrorCode               ierr;
621087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
622087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
623087f3262SPaul Mullowney   IS                           ip = a->row;
624087f3262SPaul Mullowney   const PetscInt               *rip;
625087f3262SPaul Mullowney   PetscBool                    perm_identity;
626087f3262SPaul Mullowney   PetscInt                     n = A->rmap->n;
627087f3262SPaul Mullowney 
628087f3262SPaul Mullowney   PetscFunctionBegin;
629087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
630*aa372e3fSPaul Mullowney   cusparseTriFactors->workVector = new THRUSTARRAY;
631*aa372e3fSPaul Mullowney   cusparseTriFactors->workVector->resize(n);
632*aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
633*aa372e3fSPaul Mullowney 
634087f3262SPaul Mullowney   /*lower triangular indices */
635087f3262SPaul Mullowney   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
636087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
637087f3262SPaul Mullowney   if (!perm_identity) {
638*aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
639*aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
640*aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
641*aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(rip, rip+n);
642087f3262SPaul Mullowney   }
643087f3262SPaul Mullowney   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
644087f3262SPaul Mullowney   PetscFunctionReturn(0);
645087f3262SPaul Mullowney }
646087f3262SPaul Mullowney 
647087f3262SPaul Mullowney #undef __FUNCT__
6489ae82921SPaul Mullowney #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
6496fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
6509ae82921SPaul Mullowney {
6519ae82921SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
6529ae82921SPaul Mullowney   IS             isrow = b->row,iscol = b->col;
6539ae82921SPaul Mullowney   PetscBool      row_identity,col_identity;
654b175d8bbSPaul Mullowney   PetscErrorCode ierr;
6559ae82921SPaul Mullowney 
6569ae82921SPaul Mullowney   PetscFunctionBegin;
6579ae82921SPaul Mullowney   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
658e057df02SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
6599ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
6609ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
661bda325fcSPaul Mullowney   if (row_identity && col_identity) {
662bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
663bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
664bda325fcSPaul Mullowney   } else {
665bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
666bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
667bda325fcSPaul Mullowney   }
6688dc1d2a3SPaul Mullowney 
669e057df02SPaul Mullowney   /* get the triangular factors */
670087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
6719ae82921SPaul Mullowney   PetscFunctionReturn(0);
6729ae82921SPaul Mullowney }
6739ae82921SPaul Mullowney 
674087f3262SPaul Mullowney #undef __FUNCT__
675087f3262SPaul Mullowney #define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
676087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
677087f3262SPaul Mullowney {
678087f3262SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
679087f3262SPaul Mullowney   IS             ip = b->row;
680087f3262SPaul Mullowney   PetscBool      perm_identity;
681b175d8bbSPaul Mullowney   PetscErrorCode ierr;
682087f3262SPaul Mullowney 
683087f3262SPaul Mullowney   PetscFunctionBegin;
684087f3262SPaul Mullowney   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
685087f3262SPaul Mullowney 
686087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
687087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
688087f3262SPaul Mullowney   if (perm_identity) {
689087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
690087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
691087f3262SPaul Mullowney   } else {
692087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
693087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
694087f3262SPaul Mullowney   }
695087f3262SPaul Mullowney 
696087f3262SPaul Mullowney   /* get the triangular factors */
697087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
698087f3262SPaul Mullowney   PetscFunctionReturn(0);
699087f3262SPaul Mullowney }
7009ae82921SPaul Mullowney 
701bda325fcSPaul Mullowney #undef __FUNCT__
702bda325fcSPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
703b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
704bda325fcSPaul Mullowney {
705bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
706*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
707*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
708*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
709*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
710bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
711*aa372e3fSPaul Mullowney   cusparseIndexBase_t               indexBase;
712*aa372e3fSPaul Mullowney   cusparseMatrixType_t              matrixType;
713*aa372e3fSPaul Mullowney   cusparseFillMode_t                fillMode;
714*aa372e3fSPaul Mullowney   cusparseDiagType_t                diagType;
715b175d8bbSPaul Mullowney 
716bda325fcSPaul Mullowney   PetscFunctionBegin;
717bda325fcSPaul Mullowney 
718*aa372e3fSPaul Mullowney   /*********************************************/
719*aa372e3fSPaul Mullowney   /* Now the Transpose of the Lower Tri Factor */
720*aa372e3fSPaul Mullowney   /*********************************************/
721*aa372e3fSPaul Mullowney 
722*aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
723*aa372e3fSPaul Mullowney   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
724*aa372e3fSPaul Mullowney 
725*aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
726*aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
727*aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
728*aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
729*aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
730*aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(loTriFactor->descr);
731*aa372e3fSPaul Mullowney 
732*aa372e3fSPaul Mullowney   /* Create the matrix description */
733*aa372e3fSPaul Mullowney   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
734*aa372e3fSPaul Mullowney   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
735*aa372e3fSPaul Mullowney   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
736*aa372e3fSPaul Mullowney   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
737*aa372e3fSPaul Mullowney   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);
738*aa372e3fSPaul Mullowney 
739*aa372e3fSPaul Mullowney   /* Create the solve analysis information */
740*aa372e3fSPaul Mullowney   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);
741*aa372e3fSPaul Mullowney 
742*aa372e3fSPaul Mullowney   /* set the operation */
743*aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
744*aa372e3fSPaul Mullowney 
745*aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
746*aa372e3fSPaul Mullowney   loTriFactorT->csrMat = new CsrMatrix;
747*aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
748*aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
749*aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
750*aa372e3fSPaul Mullowney   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
751*aa372e3fSPaul Mullowney   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
752*aa372e3fSPaul Mullowney   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);
753*aa372e3fSPaul Mullowney 
754*aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
755*aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
756*aa372e3fSPaul Mullowney                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
757*aa372e3fSPaul Mullowney                           loTriFactor->csrMat->values->data().get(),
758*aa372e3fSPaul Mullowney                           loTriFactor->csrMat->row_offsets->data().get(),
759*aa372e3fSPaul Mullowney                           loTriFactor->csrMat->column_indices->data().get(),
760*aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->values->data().get(),
761*aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->column_indices->data().get(),
762*aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->row_offsets->data().get(),
763*aa372e3fSPaul Mullowney                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);
764*aa372e3fSPaul Mullowney 
765*aa372e3fSPaul Mullowney   /* perform the solve analysis on the transposed matrix */
766*aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
767*aa372e3fSPaul Mullowney                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
768*aa372e3fSPaul Mullowney                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
769*aa372e3fSPaul Mullowney                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
770*aa372e3fSPaul Mullowney                            loTriFactorT->solveInfo);CHKERRCUSP(stat);
771*aa372e3fSPaul Mullowney 
772*aa372e3fSPaul Mullowney   /* assign the pointer. Is this really necessary? */
773*aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
774*aa372e3fSPaul Mullowney 
775*aa372e3fSPaul Mullowney   /*********************************************/
776*aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
777*aa372e3fSPaul Mullowney   /*********************************************/
778*aa372e3fSPaul Mullowney 
779*aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
780*aa372e3fSPaul Mullowney   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
781*aa372e3fSPaul Mullowney 
782*aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
783*aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
784*aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
785*aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
786*aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
787*aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(upTriFactor->descr);
788*aa372e3fSPaul Mullowney 
789*aa372e3fSPaul Mullowney   /* Create the matrix description */
790*aa372e3fSPaul Mullowney   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
791*aa372e3fSPaul Mullowney   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
792*aa372e3fSPaul Mullowney   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
793*aa372e3fSPaul Mullowney   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
794*aa372e3fSPaul Mullowney   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);
795*aa372e3fSPaul Mullowney 
796*aa372e3fSPaul Mullowney   /* Create the solve analysis information */
797*aa372e3fSPaul Mullowney   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);
798*aa372e3fSPaul Mullowney 
799*aa372e3fSPaul Mullowney   /* set the operation */
800*aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
801*aa372e3fSPaul Mullowney 
802*aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
803*aa372e3fSPaul Mullowney   upTriFactorT->csrMat = new CsrMatrix;
804*aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
805*aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
806*aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
807*aa372e3fSPaul Mullowney   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
808*aa372e3fSPaul Mullowney   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
809*aa372e3fSPaul Mullowney   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);
810*aa372e3fSPaul Mullowney 
811*aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
812*aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
813*aa372e3fSPaul Mullowney                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
814*aa372e3fSPaul Mullowney                           upTriFactor->csrMat->values->data().get(),
815*aa372e3fSPaul Mullowney                           upTriFactor->csrMat->row_offsets->data().get(),
816*aa372e3fSPaul Mullowney                           upTriFactor->csrMat->column_indices->data().get(),
817*aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->values->data().get(),
818*aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->column_indices->data().get(),
819*aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->row_offsets->data().get(),
820*aa372e3fSPaul Mullowney                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);
821*aa372e3fSPaul Mullowney 
822*aa372e3fSPaul Mullowney   /* perform the solve analysis on the transposed matrix */
823*aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
824*aa372e3fSPaul Mullowney                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
825*aa372e3fSPaul Mullowney                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
826*aa372e3fSPaul Mullowney                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
827*aa372e3fSPaul Mullowney                            upTriFactorT->solveInfo);CHKERRCUSP(stat);
828*aa372e3fSPaul Mullowney 
829*aa372e3fSPaul Mullowney   /* assign the pointer. Is this really necessary? */
830*aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
831bda325fcSPaul Mullowney   PetscFunctionReturn(0);
832bda325fcSPaul Mullowney }
833bda325fcSPaul Mullowney 
834bda325fcSPaul Mullowney #undef __FUNCT__
835bda325fcSPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
836b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
837bda325fcSPaul Mullowney {
838*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
839*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
840*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
841bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
842bda325fcSPaul Mullowney   cusparseStatus_t             stat;
843*aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
844b175d8bbSPaul Mullowney 
845bda325fcSPaul Mullowney   PetscFunctionBegin;
846*aa372e3fSPaul Mullowney 
847*aa372e3fSPaul Mullowney   /* allocate space for the triangular factor information */
848*aa372e3fSPaul Mullowney   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
849*aa372e3fSPaul Mullowney   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
850*aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(matstruct->descr);
851*aa372e3fSPaul Mullowney   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
852*aa372e3fSPaul Mullowney   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);
853*aa372e3fSPaul Mullowney 
854*aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
855*aa372e3fSPaul Mullowney     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
856*aa372e3fSPaul Mullowney     CsrMatrix *matrixT= new CsrMatrix;
857*aa372e3fSPaul Mullowney     matrixT->num_rows = A->rmap->n;
858*aa372e3fSPaul Mullowney     matrixT->num_cols = A->cmap->n;
859*aa372e3fSPaul Mullowney     matrixT->num_entries = a->nz;
860*aa372e3fSPaul Mullowney     matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
861*aa372e3fSPaul Mullowney     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
862*aa372e3fSPaul Mullowney     matrixT->values = new THRUSTARRAY(a->nz);
863*aa372e3fSPaul Mullowney 
864*aa372e3fSPaul Mullowney     /* compute the transpose of the upper triangular factor, i.e. the CSC */
865*aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
866*aa372e3fSPaul Mullowney     stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
867*aa372e3fSPaul Mullowney                             matrix->num_cols, matrix->num_entries,
868*aa372e3fSPaul Mullowney                             matrix->values->data().get(),
869*aa372e3fSPaul Mullowney                             matrix->row_offsets->data().get(),
870*aa372e3fSPaul Mullowney                             matrix->column_indices->data().get(),
871*aa372e3fSPaul Mullowney                             matrixT->values->data().get(),
872*aa372e3fSPaul Mullowney                             matrixT->column_indices->data().get(),
873*aa372e3fSPaul Mullowney                             matrixT->row_offsets->data().get(),
874*aa372e3fSPaul Mullowney                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);
875*aa372e3fSPaul Mullowney 
876*aa372e3fSPaul Mullowney     /* assign the pointer */
877*aa372e3fSPaul Mullowney     matstructT->mat = matrixT;
878*aa372e3fSPaul Mullowney 
879*aa372e3fSPaul Mullowney   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
880*aa372e3fSPaul Mullowney 
881*aa372e3fSPaul Mullowney     /* First convert HYB to CSR */
882*aa372e3fSPaul Mullowney     CsrMatrix *temp= new CsrMatrix;
883*aa372e3fSPaul Mullowney     temp->num_rows = A->rmap->n;
884*aa372e3fSPaul Mullowney     temp->num_cols = A->cmap->n;
885*aa372e3fSPaul Mullowney     temp->num_entries = a->nz;
886*aa372e3fSPaul Mullowney     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
887*aa372e3fSPaul Mullowney     temp->column_indices = new THRUSTINTARRAY32(a->nz);
888*aa372e3fSPaul Mullowney     temp->values = new THRUSTARRAY(a->nz);
889*aa372e3fSPaul Mullowney 
890*aa372e3fSPaul Mullowney     stat = cusparse_hyb2csr(cusparsestruct->handle,
891*aa372e3fSPaul Mullowney                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
892*aa372e3fSPaul Mullowney                             temp->values->data().get(),
893*aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
894*aa372e3fSPaul Mullowney                             temp->column_indices->data().get());CHKERRCUSP(stat);
895*aa372e3fSPaul Mullowney 
896*aa372e3fSPaul Mullowney     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
897*aa372e3fSPaul Mullowney     CsrMatrix *tempT= new CsrMatrix;
898*aa372e3fSPaul Mullowney     tempT->num_rows = A->rmap->n;
899*aa372e3fSPaul Mullowney     tempT->num_cols = A->cmap->n;
900*aa372e3fSPaul Mullowney     tempT->num_entries = a->nz;
901*aa372e3fSPaul Mullowney     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
902*aa372e3fSPaul Mullowney     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
903*aa372e3fSPaul Mullowney     tempT->values = new THRUSTARRAY(a->nz);
904*aa372e3fSPaul Mullowney 
905*aa372e3fSPaul Mullowney     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
906*aa372e3fSPaul Mullowney                             temp->num_cols, temp->num_entries,
907*aa372e3fSPaul Mullowney                             temp->values->data().get(),
908*aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
909*aa372e3fSPaul Mullowney                             temp->column_indices->data().get(),
910*aa372e3fSPaul Mullowney                             tempT->values->data().get(),
911*aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
912*aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
913*aa372e3fSPaul Mullowney                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);
914*aa372e3fSPaul Mullowney 
915*aa372e3fSPaul Mullowney     /* Last, convert CSC to HYB */
916*aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat;
917*aa372e3fSPaul Mullowney     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
918*aa372e3fSPaul Mullowney     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
919*aa372e3fSPaul Mullowney       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
920*aa372e3fSPaul Mullowney     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
921*aa372e3fSPaul Mullowney                             matstructT->descr, tempT->values->data().get(),
922*aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
923*aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
924*aa372e3fSPaul Mullowney                             hybMat, 0, partition);CHKERRCUSP(stat);
925*aa372e3fSPaul Mullowney 
926*aa372e3fSPaul Mullowney     /* assign the pointer */
927*aa372e3fSPaul Mullowney     matstructT->mat = hybMat;
928*aa372e3fSPaul Mullowney 
929*aa372e3fSPaul Mullowney     /* delete temporaries */
930*aa372e3fSPaul Mullowney     if (tempT) {
931*aa372e3fSPaul Mullowney       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
932*aa372e3fSPaul Mullowney       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
933*aa372e3fSPaul Mullowney       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
934*aa372e3fSPaul Mullowney       delete (CsrMatrix*) tempT;
935087f3262SPaul Mullowney     }
936*aa372e3fSPaul Mullowney     if (temp) {
937*aa372e3fSPaul Mullowney       if (temp->values) delete (THRUSTARRAY*) temp->values;
938*aa372e3fSPaul Mullowney       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
939*aa372e3fSPaul Mullowney       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
940*aa372e3fSPaul Mullowney       delete (CsrMatrix*) temp;
941*aa372e3fSPaul Mullowney     }
942*aa372e3fSPaul Mullowney   }
943*aa372e3fSPaul Mullowney   /* assign the compressed row indices */
944*aa372e3fSPaul Mullowney   matstructT->cprowIndices = new THRUSTINTARRAY;
945*aa372e3fSPaul Mullowney 
946*aa372e3fSPaul Mullowney   /* assign the pointer */
947*aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
948bda325fcSPaul Mullowney   PetscFunctionReturn(0);
949bda325fcSPaul Mullowney }
950bda325fcSPaul Mullowney 
951bda325fcSPaul Mullowney #undef __FUNCT__
952bda325fcSPaul Mullowney #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
9536fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
954bda325fcSPaul Mullowney {
955bda325fcSPaul Mullowney   CUSPARRAY                         *xGPU, *bGPU;
956bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
957bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
958*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
959*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
960*aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
961b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
962bda325fcSPaul Mullowney 
963bda325fcSPaul Mullowney   PetscFunctionBegin;
964*aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
965*aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
966bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
967*aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
968*aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
969bda325fcSPaul Mullowney   }
970bda325fcSPaul Mullowney 
971bda325fcSPaul Mullowney   /* Get the GPU pointers */
972bda325fcSPaul Mullowney   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
973bda325fcSPaul Mullowney   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
974bda325fcSPaul Mullowney 
975*aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
976*aa372e3fSPaul Mullowney   thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
977*aa372e3fSPaul Mullowney                thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
978*aa372e3fSPaul Mullowney                xGPU->begin());
979*aa372e3fSPaul Mullowney 
980*aa372e3fSPaul Mullowney   /* First, solve U */
981*aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
982*aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
983*aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
984*aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
985*aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
986*aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
987*aa372e3fSPaul Mullowney                         xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);
988*aa372e3fSPaul Mullowney 
989*aa372e3fSPaul Mullowney   /* Then, solve L */
990*aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
991*aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
992*aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
993*aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
994*aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
995*aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
996*aa372e3fSPaul Mullowney                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);
997*aa372e3fSPaul Mullowney 
998*aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
999*aa372e3fSPaul Mullowney   thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1000*aa372e3fSPaul Mullowney                thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
1001*aa372e3fSPaul Mullowney                tempGPU->begin());
1002*aa372e3fSPaul Mullowney 
1003*aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1004*aa372e3fSPaul Mullowney   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());
1005bda325fcSPaul Mullowney 
1006bda325fcSPaul Mullowney   /* restore */
1007bda325fcSPaul Mullowney   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
1008bda325fcSPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
1009bda325fcSPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
1010087f3262SPaul Mullowney 
1011*aa372e3fSPaul Mullowney   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1012bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1013bda325fcSPaul Mullowney }
1014bda325fcSPaul Mullowney 
1015bda325fcSPaul Mullowney #undef __FUNCT__
1016bda325fcSPaul Mullowney #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
10176fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1018bda325fcSPaul Mullowney {
1019bda325fcSPaul Mullowney   CUSPARRAY                         *xGPU,*bGPU;
1020bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1021bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1022*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1023*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1024*aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1025b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
1026bda325fcSPaul Mullowney 
1027bda325fcSPaul Mullowney   PetscFunctionBegin;
1028*aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1029*aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1030bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1031*aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1032*aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1033bda325fcSPaul Mullowney   }
1034bda325fcSPaul Mullowney 
1035bda325fcSPaul Mullowney   /* Get the GPU pointers */
1036bda325fcSPaul Mullowney   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
1037bda325fcSPaul Mullowney   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
1038bda325fcSPaul Mullowney 
1039*aa372e3fSPaul Mullowney   /* First, solve U */
1040*aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1041*aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
1042*aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1043*aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1044*aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1045*aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1046*aa372e3fSPaul Mullowney                         bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);
1047*aa372e3fSPaul Mullowney 
1048*aa372e3fSPaul Mullowney   /* Then, solve L */
1049*aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1050*aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1051*aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1052*aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1053*aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1054*aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1055*aa372e3fSPaul Mullowney                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);
1056bda325fcSPaul Mullowney 
1057bda325fcSPaul Mullowney   /* restore */
1058bda325fcSPaul Mullowney   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
1059bda325fcSPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
1060bda325fcSPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
1061*aa372e3fSPaul Mullowney   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1062bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1063bda325fcSPaul Mullowney }
1064bda325fcSPaul Mullowney 
10659ae82921SPaul Mullowney #undef __FUNCT__
10669ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
10676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
10689ae82921SPaul Mullowney {
1069bda325fcSPaul Mullowney   CUSPARRAY                         *xGPU,*bGPU;
10709ae82921SPaul Mullowney   cusparseStatus_t                  stat;
10719ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1072*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1073*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1074*aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1075b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
10769ae82921SPaul Mullowney 
10779ae82921SPaul Mullowney   PetscFunctionBegin;
1078e057df02SPaul Mullowney   /* Get the GPU pointers */
10799ae82921SPaul Mullowney   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
10809ae82921SPaul Mullowney   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
10819ae82921SPaul Mullowney 
1082*aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1083*aa372e3fSPaul Mullowney   thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
1084*aa372e3fSPaul Mullowney                thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
1085*aa372e3fSPaul Mullowney                xGPU->begin());
1086*aa372e3fSPaul Mullowney 
1087*aa372e3fSPaul Mullowney   /* Next, solve L */
1088*aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1089*aa372e3fSPaul Mullowney                         loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1090*aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1091*aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1092*aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1093*aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1094*aa372e3fSPaul Mullowney                         xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);
1095*aa372e3fSPaul Mullowney 
1096*aa372e3fSPaul Mullowney   /* Then, solve U */
1097*aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1098*aa372e3fSPaul Mullowney                         upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1099*aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1100*aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1101*aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1102*aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1103*aa372e3fSPaul Mullowney                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);
1104*aa372e3fSPaul Mullowney 
1105*aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1106*aa372e3fSPaul Mullowney   thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1107*aa372e3fSPaul Mullowney                thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
1108*aa372e3fSPaul Mullowney                tempGPU->begin());
1109*aa372e3fSPaul Mullowney 
1110*aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1111*aa372e3fSPaul Mullowney   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());
11129ae82921SPaul Mullowney 
11139ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
11149ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
11159ae82921SPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
1116*aa372e3fSPaul Mullowney   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
11179ae82921SPaul Mullowney   PetscFunctionReturn(0);
11189ae82921SPaul Mullowney }
11199ae82921SPaul Mullowney 
11209ae82921SPaul Mullowney #undef __FUNCT__
11219ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
11226fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
11239ae82921SPaul Mullowney {
1124bda325fcSPaul Mullowney   CUSPARRAY                         *xGPU,*bGPU;
11259ae82921SPaul Mullowney   cusparseStatus_t                  stat;
11269ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1127*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1128*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1129*aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1130b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
11319ae82921SPaul Mullowney 
11329ae82921SPaul Mullowney   PetscFunctionBegin;
1133e057df02SPaul Mullowney   /* Get the GPU pointers */
11349ae82921SPaul Mullowney   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
11359ae82921SPaul Mullowney   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
11369ae82921SPaul Mullowney 
1137*aa372e3fSPaul Mullowney   /* First, solve L */
1138*aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1139*aa372e3fSPaul Mullowney                         loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1140*aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1141*aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1142*aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1143*aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1144*aa372e3fSPaul Mullowney                         bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);
1145*aa372e3fSPaul Mullowney 
1146*aa372e3fSPaul Mullowney   /* Next, solve U */
1147*aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1148*aa372e3fSPaul Mullowney                         upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1149*aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1150*aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1151*aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1152*aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1153*aa372e3fSPaul Mullowney                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);
11549ae82921SPaul Mullowney 
11559ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
11569ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
11579ae82921SPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
1158*aa372e3fSPaul Mullowney   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
11599ae82921SPaul Mullowney   PetscFunctionReturn(0);
11609ae82921SPaul Mullowney }
11619ae82921SPaul Mullowney 
11629ae82921SPaul Mullowney #undef __FUNCT__
1163e057df02SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
11646fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
11659ae82921SPaul Mullowney {
11669ae82921SPaul Mullowney 
1167*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1168*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
11699ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
11709ae82921SPaul Mullowney   PetscInt                     m = A->rmap->n,*ii,*ridx;
11719ae82921SPaul Mullowney   PetscErrorCode               ierr;
1172*aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
11739ae82921SPaul Mullowney 
11749ae82921SPaul Mullowney   PetscFunctionBegin;
11759ae82921SPaul Mullowney   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
11769ae82921SPaul Mullowney     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1177*aa372e3fSPaul Mullowney     if (matstruct) {
1178*aa372e3fSPaul Mullowney       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized.");
11799ae82921SPaul Mullowney     }
11809ae82921SPaul Mullowney     try {
1181*aa372e3fSPaul Mullowney       cusparsestruct->nonzerorow=0;
1182*aa372e3fSPaul Mullowney       for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
11839ae82921SPaul Mullowney 
11849ae82921SPaul Mullowney       if (a->compressedrow.use) {
11859ae82921SPaul Mullowney         m    = a->compressedrow.nrows;
11869ae82921SPaul Mullowney         ii   = a->compressedrow.i;
11879ae82921SPaul Mullowney         ridx = a->compressedrow.rindex;
11889ae82921SPaul Mullowney       } else {
1189e057df02SPaul Mullowney         /* Forcing compressed row on the GPU ... only relevant for CSR storage */
11909ae82921SPaul Mullowney         int k=0;
1191*aa372e3fSPaul Mullowney         ierr = PetscMalloc((cusparsestruct->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
1192*aa372e3fSPaul Mullowney         ierr = PetscMalloc((cusparsestruct->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
11939ae82921SPaul Mullowney         ii[0]=0;
11949ae82921SPaul Mullowney         for (int j = 0; j<m; j++) {
11959ae82921SPaul Mullowney           if ((a->i[j+1]-a->i[j])>0) {
11969ae82921SPaul Mullowney             ii[k]  = a->i[j];
11979ae82921SPaul Mullowney             ridx[k]= j;
11989ae82921SPaul Mullowney             k++;
11999ae82921SPaul Mullowney           }
12009ae82921SPaul Mullowney         }
1201*aa372e3fSPaul Mullowney         ii[cusparsestruct->nonzerorow] = a->nz;
12022205254eSKarl Rupp 
1203*aa372e3fSPaul Mullowney         m = cusparsestruct->nonzerorow;
12049ae82921SPaul Mullowney       }
12059ae82921SPaul Mullowney 
1206*aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
1207*aa372e3fSPaul Mullowney       matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1208*aa372e3fSPaul Mullowney       stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
1209*aa372e3fSPaul Mullowney       stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
1210*aa372e3fSPaul Mullowney       stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);
12119ae82921SPaul Mullowney 
1212*aa372e3fSPaul Mullowney       /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1213*aa372e3fSPaul Mullowney       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1214*aa372e3fSPaul Mullowney         /* set the matrix */
1215*aa372e3fSPaul Mullowney 	CsrMatrix *matrix= new CsrMatrix;
1216*aa372e3fSPaul Mullowney 	matrix->num_rows = A->rmap->n;
1217*aa372e3fSPaul Mullowney 	matrix->num_cols = A->cmap->n;
1218*aa372e3fSPaul Mullowney 	matrix->num_entries = a->nz;
1219*aa372e3fSPaul Mullowney 	matrix->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1220*aa372e3fSPaul Mullowney 	matrix->row_offsets->assign(ii, ii + A->rmap->n+1);
12219ae82921SPaul Mullowney 
1222*aa372e3fSPaul Mullowney 	matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1223*aa372e3fSPaul Mullowney 	matrix->column_indices->assign(a->j, a->j+a->nz);
1224*aa372e3fSPaul Mullowney 
1225*aa372e3fSPaul Mullowney 	matrix->values = new THRUSTARRAY(a->nz);
1226*aa372e3fSPaul Mullowney 	matrix->values->assign(a->a, a->a+a->nz);
1227*aa372e3fSPaul Mullowney 
1228*aa372e3fSPaul Mullowney         /* assign the pointer */
1229*aa372e3fSPaul Mullowney         matstruct->mat = matrix;
1230*aa372e3fSPaul Mullowney 
1231*aa372e3fSPaul Mullowney       } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1232*aa372e3fSPaul Mullowney 
1233*aa372e3fSPaul Mullowney 	CsrMatrix *matrix= new CsrMatrix;
1234*aa372e3fSPaul Mullowney 	matrix->num_rows = A->rmap->n;
1235*aa372e3fSPaul Mullowney 	matrix->num_cols = A->cmap->n;
1236*aa372e3fSPaul Mullowney 	matrix->num_entries = a->nz;
1237*aa372e3fSPaul Mullowney 	matrix->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1238*aa372e3fSPaul Mullowney 	matrix->row_offsets->assign(ii, ii + A->rmap->n+1);
1239*aa372e3fSPaul Mullowney 
1240*aa372e3fSPaul Mullowney 	matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1241*aa372e3fSPaul Mullowney 	matrix->column_indices->assign(a->j, a->j+a->nz);
1242*aa372e3fSPaul Mullowney 
1243*aa372e3fSPaul Mullowney 	matrix->values = new THRUSTARRAY(a->nz);
1244*aa372e3fSPaul Mullowney 	matrix->values->assign(a->a, a->a+a->nz);
1245*aa372e3fSPaul Mullowney 
1246*aa372e3fSPaul Mullowney         cusparseHybMat_t hybMat;
1247*aa372e3fSPaul Mullowney         stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
1248*aa372e3fSPaul Mullowney         cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1249*aa372e3fSPaul Mullowney           CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1250*aa372e3fSPaul Mullowney         stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1251*aa372e3fSPaul Mullowney                                 matstruct->descr, matrix->values->data().get(),
1252*aa372e3fSPaul Mullowney                                 matrix->row_offsets->data().get(),
1253*aa372e3fSPaul Mullowney                                 matrix->column_indices->data().get(),
1254*aa372e3fSPaul Mullowney                                 hybMat, 0, partition);CHKERRCUSP(stat);
1255*aa372e3fSPaul Mullowney         /* assign the pointer */
1256*aa372e3fSPaul Mullowney         matstruct->mat = hybMat;
1257*aa372e3fSPaul Mullowney 
1258*aa372e3fSPaul Mullowney 	if (matrix) {
1259*aa372e3fSPaul Mullowney 	  if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1260*aa372e3fSPaul Mullowney 	  if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1261*aa372e3fSPaul Mullowney 	  if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1262*aa372e3fSPaul Mullowney 	  delete (CsrMatrix*)matrix;
1263087f3262SPaul Mullowney 	}
1264087f3262SPaul Mullowney       }
1265ca45077fSPaul Mullowney 
1266*aa372e3fSPaul Mullowney       /* assign the compressed row indices */
1267*aa372e3fSPaul Mullowney       matstruct->cprowIndices = new THRUSTINTARRAY(m);
1268*aa372e3fSPaul Mullowney       matstruct->cprowIndices->assign(ridx,ridx+m);
1269*aa372e3fSPaul Mullowney 
1270*aa372e3fSPaul Mullowney       /* assign the pointer */
1271*aa372e3fSPaul Mullowney       cusparsestruct->mat = matstruct;
1272*aa372e3fSPaul Mullowney 
12739ae82921SPaul Mullowney       if (!a->compressedrow.use) {
12749ae82921SPaul Mullowney         ierr = PetscFree(ii);CHKERRQ(ierr);
12759ae82921SPaul Mullowney         ierr = PetscFree(ridx);CHKERRQ(ierr);
12769ae82921SPaul Mullowney       }
1277*aa372e3fSPaul Mullowney       cusparsestruct->workVector = new THRUSTARRAY;
1278*aa372e3fSPaul Mullowney       cusparsestruct->workVector->resize(m);
12799ae82921SPaul Mullowney     } catch(char *ex) {
12809ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
12819ae82921SPaul Mullowney     }
12829ae82921SPaul Mullowney     ierr = WaitForGPU();CHKERRCUSP(ierr);
12832205254eSKarl Rupp 
12849ae82921SPaul Mullowney     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
12852205254eSKarl Rupp 
12869ae82921SPaul Mullowney     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
12879ae82921SPaul Mullowney   }
12889ae82921SPaul Mullowney   PetscFunctionReturn(0);
12899ae82921SPaul Mullowney }
12909ae82921SPaul Mullowney 
12919ae82921SPaul Mullowney #undef __FUNCT__
12929ae82921SPaul Mullowney #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
12936fa9248bSJed Brown static PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
12949ae82921SPaul Mullowney {
12959ae82921SPaul Mullowney   PetscErrorCode ierr;
12969ae82921SPaul Mullowney 
12979ae82921SPaul Mullowney   PetscFunctionBegin;
12989ae82921SPaul Mullowney   if (right) {
1299ce94432eSBarry Smith     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
13009ae82921SPaul Mullowney     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
13019ae82921SPaul Mullowney     ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
13029ae82921SPaul Mullowney     ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
13039ae82921SPaul Mullowney     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
13049ae82921SPaul Mullowney   }
13059ae82921SPaul Mullowney   if (left) {
1306ce94432eSBarry Smith     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
13079ae82921SPaul Mullowney     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
13089ae82921SPaul Mullowney     ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
13099ae82921SPaul Mullowney     ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
13109ae82921SPaul Mullowney     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
13119ae82921SPaul Mullowney   }
13129ae82921SPaul Mullowney   PetscFunctionReturn(0);
13139ae82921SPaul Mullowney }
13149ae82921SPaul Mullowney 
1315*aa372e3fSPaul Mullowney struct VecCUSPPlusEquals
1316*aa372e3fSPaul Mullowney {
1317*aa372e3fSPaul Mullowney   template <typename Tuple>
1318*aa372e3fSPaul Mullowney   __host__ __device__
1319*aa372e3fSPaul Mullowney   void operator()(Tuple t)
1320*aa372e3fSPaul Mullowney   {
1321*aa372e3fSPaul Mullowney     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1322*aa372e3fSPaul Mullowney   }
1323*aa372e3fSPaul Mullowney };
1324*aa372e3fSPaul Mullowney 
13259ae82921SPaul Mullowney #undef __FUNCT__
13269ae82921SPaul Mullowney #define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
13276fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
13289ae82921SPaul Mullowney {
13299ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1330*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1331*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1332bda325fcSPaul Mullowney   CUSPARRAY                    *xarray,*yarray;
1333b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
1334*aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
13359ae82921SPaul Mullowney 
13369ae82921SPaul Mullowney   PetscFunctionBegin;
1337e057df02SPaul Mullowney   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1338e057df02SPaul Mullowney      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
13399ae82921SPaul Mullowney   ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
13409ae82921SPaul Mullowney   ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
1341*aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1342*aa372e3fSPaul Mullowney     CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1343*aa372e3fSPaul Mullowney     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1344*aa372e3fSPaul Mullowney                              mat->num_rows, mat->num_cols, mat->num_entries,
1345*aa372e3fSPaul Mullowney                              &ALPHA, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
1346*aa372e3fSPaul Mullowney                              mat->column_indices->data().get(), xarray->data().get(), &BETA,
1347*aa372e3fSPaul Mullowney                              yarray->data().get());CHKERRCUSP(stat);
1348*aa372e3fSPaul Mullowney   } else {
1349*aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1350*aa372e3fSPaul Mullowney     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1351*aa372e3fSPaul Mullowney                              &ALPHA, matstruct->descr, hybMat,
1352*aa372e3fSPaul Mullowney                              xarray->data().get(), &BETA,
1353*aa372e3fSPaul Mullowney                              yarray->data().get());CHKERRCUSP(stat);
13549ae82921SPaul Mullowney   }
13559ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
13569ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
1357*aa372e3fSPaul Mullowney   if (!cusparsestruct->stream) {
13589ae82921SPaul Mullowney     ierr = WaitForGPU();CHKERRCUSP(ierr);
1359ca45077fSPaul Mullowney   }
1360*aa372e3fSPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
13619ae82921SPaul Mullowney   PetscFunctionReturn(0);
13629ae82921SPaul Mullowney }
13639ae82921SPaul Mullowney 
13649ae82921SPaul Mullowney #undef __FUNCT__
1365ca45077fSPaul Mullowney #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
13666fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1367ca45077fSPaul Mullowney {
1368ca45077fSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1369*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1370*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1371bda325fcSPaul Mullowney   CUSPARRAY                    *xarray,*yarray;
1372b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
1373*aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1374ca45077fSPaul Mullowney 
1375ca45077fSPaul Mullowney   PetscFunctionBegin;
1376e057df02SPaul Mullowney   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1377e057df02SPaul Mullowney      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
1378*aa372e3fSPaul Mullowney   if (!matstructT) {
1379bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1380*aa372e3fSPaul Mullowney     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1381bda325fcSPaul Mullowney   }
1382ca45077fSPaul Mullowney   ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1383ca45077fSPaul Mullowney   ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
1384*aa372e3fSPaul Mullowney 
1385*aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1386*aa372e3fSPaul Mullowney     CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1387*aa372e3fSPaul Mullowney     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1388*aa372e3fSPaul Mullowney                              mat->num_rows, mat->num_cols,
1389*aa372e3fSPaul Mullowney                              mat->num_entries, &ALPHA, matstructT->descr,
1390*aa372e3fSPaul Mullowney                              mat->values->data().get(), mat->row_offsets->data().get(),
1391*aa372e3fSPaul Mullowney                              mat->column_indices->data().get(), xarray->data().get(), &BETA,
1392*aa372e3fSPaul Mullowney                              yarray->data().get());CHKERRCUSP(stat);
1393*aa372e3fSPaul Mullowney   } else {
1394*aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1395*aa372e3fSPaul Mullowney     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1396*aa372e3fSPaul Mullowney                              &ALPHA, matstructT->descr, hybMat,
1397*aa372e3fSPaul Mullowney                              xarray->data().get(), &BETA,
1398*aa372e3fSPaul Mullowney                              yarray->data().get());CHKERRCUSP(stat);
1399ca45077fSPaul Mullowney   }
1400ca45077fSPaul Mullowney   ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1401ca45077fSPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
1402*aa372e3fSPaul Mullowney   if (!cusparsestruct->stream) {
1403ca45077fSPaul Mullowney     ierr = WaitForGPU();CHKERRCUSP(ierr);
1404ca45077fSPaul Mullowney   }
1405*aa372e3fSPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
1406ca45077fSPaul Mullowney   PetscFunctionReturn(0);
1407ca45077fSPaul Mullowney }
1408ca45077fSPaul Mullowney 
1409*aa372e3fSPaul Mullowney 
1410ca45077fSPaul Mullowney #undef __FUNCT__
14119ae82921SPaul Mullowney #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
14126fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
14139ae82921SPaul Mullowney {
14149ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1415*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1416*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1417bda325fcSPaul Mullowney   CUSPARRAY                    *xarray,*yarray,*zarray;
1418b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
1419*aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
14206e111a19SKarl Rupp 
14219ae82921SPaul Mullowney   PetscFunctionBegin;
1422e057df02SPaul Mullowney   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1423e057df02SPaul Mullowney      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
14249ae82921SPaul Mullowney   try {
14259ae82921SPaul Mullowney     ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
14269ae82921SPaul Mullowney     ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
14279ae82921SPaul Mullowney     ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
14289ae82921SPaul Mullowney     ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
14299ae82921SPaul Mullowney 
1430e057df02SPaul Mullowney     /* multiply add */
1431*aa372e3fSPaul Mullowney     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1432*aa372e3fSPaul Mullowney       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1433*aa372e3fSPaul Mullowney       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1434*aa372e3fSPaul Mullowney                                mat->num_rows, mat->num_cols,
1435*aa372e3fSPaul Mullowney                                mat->num_entries, &ALPHA, matstruct->descr,
1436*aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
1437*aa372e3fSPaul Mullowney                                mat->column_indices->data().get(), xarray->data().get(), &BETA,
1438*aa372e3fSPaul Mullowney                                cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1439*aa372e3fSPaul Mullowney     } else {
1440*aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1441*aa372e3fSPaul Mullowney       stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1442*aa372e3fSPaul Mullowney 			       &ALPHA, matstruct->descr, hybMat,
1443*aa372e3fSPaul Mullowney 			       xarray->data().get(), &BETA,
1444*aa372e3fSPaul Mullowney 			       cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1445*aa372e3fSPaul Mullowney     }
1446*aa372e3fSPaul Mullowney 
1447*aa372e3fSPaul Mullowney     /* scatter the data from the temporary into the full vector with a += operation */
1448*aa372e3fSPaul Mullowney     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
1449*aa372e3fSPaul Mullowney 		     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1450*aa372e3fSPaul Mullowney 		     VecCUSPPlusEquals());
14519ae82921SPaul Mullowney 
14529ae82921SPaul Mullowney     ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
14539ae82921SPaul Mullowney     ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
14549ae82921SPaul Mullowney     ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
14559ae82921SPaul Mullowney 
14569ae82921SPaul Mullowney   } catch(char *ex) {
14579ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
14589ae82921SPaul Mullowney   }
14599ae82921SPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
14609ae82921SPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
14619ae82921SPaul Mullowney   PetscFunctionReturn(0);
14629ae82921SPaul Mullowney }
14639ae82921SPaul Mullowney 
14649ae82921SPaul Mullowney #undef __FUNCT__
1465b175d8bbSPaul Mullowney #define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
14666fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1467ca45077fSPaul Mullowney {
1468ca45077fSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1469*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1470*aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1471ca45077fSPaul Mullowney   CUSPARRAY                    *xarray,*yarray,*zarray;
1472b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
1473*aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
14746e111a19SKarl Rupp 
1475ca45077fSPaul Mullowney   PetscFunctionBegin;
1476e057df02SPaul Mullowney   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1477e057df02SPaul Mullowney      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
1478*aa372e3fSPaul Mullowney   if (!matstructT) {
1479bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1480*aa372e3fSPaul Mullowney     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1481bda325fcSPaul Mullowney   }
1482*aa372e3fSPaul Mullowney 
1483ca45077fSPaul Mullowney   try {
1484ca45077fSPaul Mullowney     ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
1485ca45077fSPaul Mullowney     ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1486ca45077fSPaul Mullowney     ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
1487ca45077fSPaul Mullowney     ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1488ca45077fSPaul Mullowney 
1489e057df02SPaul Mullowney     /* multiply add with matrix transpose */
1490*aa372e3fSPaul Mullowney     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1491*aa372e3fSPaul Mullowney       CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1492*aa372e3fSPaul Mullowney       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1493*aa372e3fSPaul Mullowney                                mat->num_rows, mat->num_cols, mat->num_entries, &ALPHA, matstructT->descr,
1494*aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
1495*aa372e3fSPaul Mullowney                                mat->column_indices->data().get(), xarray->data().get(), &BETA,
1496*aa372e3fSPaul Mullowney                                cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1497*aa372e3fSPaul Mullowney     } else {
1498*aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1499*aa372e3fSPaul Mullowney       stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1500*aa372e3fSPaul Mullowney 			       &ALPHA, matstructT->descr, hybMat,
1501*aa372e3fSPaul Mullowney 			       xarray->data().get(), &BETA,
1502*aa372e3fSPaul Mullowney 			       cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1503*aa372e3fSPaul Mullowney     }
1504*aa372e3fSPaul Mullowney 
1505*aa372e3fSPaul Mullowney     /* scatter the data from the temporary into the full vector with a += operation */
1506*aa372e3fSPaul Mullowney     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))),
1507*aa372e3fSPaul Mullowney 		     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1508*aa372e3fSPaul Mullowney 		     VecCUSPPlusEquals());
1509ca45077fSPaul Mullowney 
1510ca45077fSPaul Mullowney     ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1511ca45077fSPaul Mullowney     ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
1512ca45077fSPaul Mullowney     ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1513ca45077fSPaul Mullowney 
1514ca45077fSPaul Mullowney   } catch(char *ex) {
1515ca45077fSPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1516ca45077fSPaul Mullowney   }
1517ca45077fSPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
1518ca45077fSPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1519ca45077fSPaul Mullowney   PetscFunctionReturn(0);
1520ca45077fSPaul Mullowney }
1521ca45077fSPaul Mullowney 
1522ca45077fSPaul Mullowney #undef __FUNCT__
15239ae82921SPaul Mullowney #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
15246fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
15259ae82921SPaul Mullowney {
15269ae82921SPaul Mullowney   PetscErrorCode ierr;
15276e111a19SKarl Rupp 
15289ae82921SPaul Mullowney   PetscFunctionBegin;
15299ae82921SPaul Mullowney   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
1530bc3f50f2SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
1531e057df02SPaul Mullowney     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1532bc3f50f2SPaul Mullowney   }
15339ae82921SPaul Mullowney   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1534bbf3fe20SPaul Mullowney   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1535bbf3fe20SPaul Mullowney   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1536bbf3fe20SPaul Mullowney   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1537bbf3fe20SPaul Mullowney   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
15389ae82921SPaul Mullowney   PetscFunctionReturn(0);
15399ae82921SPaul Mullowney }
15409ae82921SPaul Mullowney 
15419ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
15429ae82921SPaul Mullowney #undef __FUNCT__
15439ae82921SPaul Mullowney #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
1544e057df02SPaul Mullowney /*@
15459ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1546e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
1547e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1548e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
1549e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
1550e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
15519ae82921SPaul Mullowney 
15529ae82921SPaul Mullowney    Collective on MPI_Comm
15539ae82921SPaul Mullowney 
15549ae82921SPaul Mullowney    Input Parameters:
15559ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
15569ae82921SPaul Mullowney .  m - number of rows
15579ae82921SPaul Mullowney .  n - number of columns
15589ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
15599ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
15600298fd71SBarry Smith          (possibly different for each row) or NULL
15619ae82921SPaul Mullowney 
15629ae82921SPaul Mullowney    Output Parameter:
15639ae82921SPaul Mullowney .  A - the matrix
15649ae82921SPaul Mullowney 
15659ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
15669ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
15679ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
15689ae82921SPaul Mullowney 
15699ae82921SPaul Mullowney    Notes:
15709ae82921SPaul Mullowney    If nnz is given then nz is ignored
15719ae82921SPaul Mullowney 
15729ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
15739ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
15749ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
15759ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
15769ae82921SPaul Mullowney 
15779ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
15780298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
15799ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
15809ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
15819ae82921SPaul Mullowney 
15829ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
15839ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
15849ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
15859ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
15869ae82921SPaul Mullowney 
15879ae82921SPaul Mullowney    Level: intermediate
15889ae82921SPaul Mullowney 
1589e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
15909ae82921SPaul Mullowney @*/
15919ae82921SPaul Mullowney PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
15929ae82921SPaul Mullowney {
15939ae82921SPaul Mullowney   PetscErrorCode ierr;
15949ae82921SPaul Mullowney 
15959ae82921SPaul Mullowney   PetscFunctionBegin;
15969ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
15979ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
15989ae82921SPaul Mullowney   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
15999ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
16009ae82921SPaul Mullowney   PetscFunctionReturn(0);
16019ae82921SPaul Mullowney }
16029ae82921SPaul Mullowney 
16039ae82921SPaul Mullowney #undef __FUNCT__
16049ae82921SPaul Mullowney #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
16056fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
16069ae82921SPaul Mullowney {
16079ae82921SPaul Mullowney   PetscErrorCode     ierr;
1608*aa372e3fSPaul Mullowney   cusparseStatus_t   stat;
16099ae82921SPaul Mullowney 
16109ae82921SPaul Mullowney   PetscFunctionBegin;
16119ae82921SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
16129ae82921SPaul Mullowney     try {
16139ae82921SPaul Mullowney       if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
1614*aa372e3fSPaul Mullowney         Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1615*aa372e3fSPaul Mullowney         Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1616*aa372e3fSPaul Mullowney         Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1617*aa372e3fSPaul Mullowney 
1618*aa372e3fSPaul Mullowney         /* delete all the members in each of the data structures */
1619*aa372e3fSPaul Mullowney         if (matstruct) {
1620*aa372e3fSPaul Mullowney           if (matstruct->descr) { stat = cusparseDestroyMatDescr(matstruct->descr);CHKERRCUSP(stat); }
1621*aa372e3fSPaul Mullowney           if (matstruct->mat) {
1622*aa372e3fSPaul Mullowney             if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1623*aa372e3fSPaul Mullowney               cusparseHybMat_t hybMat = (cusparseHybMat_t) matstruct->mat;
1624*aa372e3fSPaul Mullowney               stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
1625*aa372e3fSPaul Mullowney             } else {
1626*aa372e3fSPaul Mullowney 	      CsrMatrix* mat = (CsrMatrix*)matstruct->mat;
1627*aa372e3fSPaul Mullowney 	      if (mat->values) delete (THRUSTARRAY*)mat->values;
1628*aa372e3fSPaul Mullowney 	      if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1629*aa372e3fSPaul Mullowney 	      if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1630*aa372e3fSPaul Mullowney               delete (CsrMatrix*)matstruct->mat;
16319ae82921SPaul Mullowney             }
1632*aa372e3fSPaul Mullowney           }
1633*aa372e3fSPaul Mullowney           if (matstruct->cprowIndices) delete (THRUSTINTARRAY*)matstruct->cprowIndices;
1634*aa372e3fSPaul Mullowney           if (matstruct) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstruct;
1635*aa372e3fSPaul Mullowney         }
1636*aa372e3fSPaul Mullowney         if (matstructT) {
1637*aa372e3fSPaul Mullowney           if (matstructT->descr) { stat = cusparseDestroyMatDescr(matstructT->descr);CHKERRCUSP(stat); }
1638*aa372e3fSPaul Mullowney           if (matstructT->mat) {
1639*aa372e3fSPaul Mullowney             if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1640*aa372e3fSPaul Mullowney               cusparseHybMat_t hybMat = (cusparseHybMat_t) matstructT->mat;
1641*aa372e3fSPaul Mullowney               stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
1642*aa372e3fSPaul Mullowney             } else {
1643*aa372e3fSPaul Mullowney 	      CsrMatrix* matT = (CsrMatrix*)matstructT->mat;
1644*aa372e3fSPaul Mullowney 	      if (matT->values) delete (THRUSTARRAY*)matT->values;
1645*aa372e3fSPaul Mullowney 	      if (matT->column_indices) delete (THRUSTINTARRAY32*)matT->column_indices;
1646*aa372e3fSPaul Mullowney 	      if (matT->row_offsets) delete (THRUSTINTARRAY32*)matT->row_offsets;
1647*aa372e3fSPaul Mullowney               delete (CsrMatrix*)matstructT->mat;
1648*aa372e3fSPaul Mullowney             }
1649*aa372e3fSPaul Mullowney           }
1650*aa372e3fSPaul Mullowney           if (matstructT->cprowIndices) delete (THRUSTINTARRAY*)matstructT->cprowIndices;
1651*aa372e3fSPaul Mullowney           if (matstructT) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstructT;
1652*aa372e3fSPaul Mullowney         }
1653*aa372e3fSPaul Mullowney         if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector;
1654*aa372e3fSPaul Mullowney         if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
1655*aa372e3fSPaul Mullowney         if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
16569ae82921SPaul Mullowney         A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
1657*aa372e3fSPaul Mullowney       } else {
1658*aa372e3fSPaul Mullowney         Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1659*aa372e3fSPaul Mullowney         if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
1660*aa372e3fSPaul Mullowney         if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
1661*aa372e3fSPaul Mullowney       }
16629ae82921SPaul Mullowney     } catch(char *ex) {
16639ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
16649ae82921SPaul Mullowney     }
16659ae82921SPaul Mullowney   } else {
1666e057df02SPaul Mullowney     /* The triangular factors */
16679ae82921SPaul Mullowney     try {
16689ae82921SPaul Mullowney       Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1669*aa372e3fSPaul Mullowney       Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1670*aa372e3fSPaul Mullowney       Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1671*aa372e3fSPaul Mullowney       Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1672*aa372e3fSPaul Mullowney       Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1673*aa372e3fSPaul Mullowney 
1674*aa372e3fSPaul Mullowney       /* delete all the members in each of the data structures */
1675*aa372e3fSPaul Mullowney       if (loTriFactor) {
1676*aa372e3fSPaul Mullowney         if (loTriFactor->descr) { stat = cusparseDestroyMatDescr(loTriFactor->descr);CHKERRCUSP(stat); }
1677*aa372e3fSPaul Mullowney         if (loTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactor->solveInfo);CHKERRCUSP(stat); }
1678*aa372e3fSPaul Mullowney         if (loTriFactor->csrMat) {
1679*aa372e3fSPaul Mullowney 	  if (loTriFactor->csrMat->values) delete (THRUSTARRAY*)loTriFactor->csrMat->values;
1680*aa372e3fSPaul Mullowney 	  if (loTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->column_indices;
1681*aa372e3fSPaul Mullowney 	  if (loTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->row_offsets;
1682*aa372e3fSPaul Mullowney 	  delete (CsrMatrix*)loTriFactor->csrMat;
1683*aa372e3fSPaul Mullowney 	}
1684*aa372e3fSPaul Mullowney         if (loTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactor;
1685*aa372e3fSPaul Mullowney       }
1686*aa372e3fSPaul Mullowney 
1687*aa372e3fSPaul Mullowney       if (upTriFactor) {
1688*aa372e3fSPaul Mullowney         if (upTriFactor->descr) { stat = cusparseDestroyMatDescr(upTriFactor->descr);CHKERRCUSP(stat); }
1689*aa372e3fSPaul Mullowney         if (upTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactor->solveInfo);CHKERRCUSP(stat); }
1690*aa372e3fSPaul Mullowney         if (upTriFactor->csrMat) {
1691*aa372e3fSPaul Mullowney 	  if (upTriFactor->csrMat->values) delete (THRUSTARRAY*)upTriFactor->csrMat->values;
1692*aa372e3fSPaul Mullowney 	  if (upTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->column_indices;
1693*aa372e3fSPaul Mullowney 	  if (upTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->row_offsets;
1694*aa372e3fSPaul Mullowney 	  delete (CsrMatrix*)upTriFactor->csrMat;
1695*aa372e3fSPaul Mullowney 	}
1696*aa372e3fSPaul Mullowney         if (upTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactor;
1697*aa372e3fSPaul Mullowney       }
1698*aa372e3fSPaul Mullowney 
1699*aa372e3fSPaul Mullowney       if (loTriFactorT) {
1700*aa372e3fSPaul Mullowney         if (loTriFactorT->descr) { stat = cusparseDestroyMatDescr(loTriFactorT->descr);CHKERRCUSP(stat); }
1701*aa372e3fSPaul Mullowney         if (loTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactorT->solveInfo);CHKERRCUSP(stat); }
1702*aa372e3fSPaul Mullowney         if (loTriFactorT->csrMat) {
1703*aa372e3fSPaul Mullowney 	  if (loTriFactorT->csrMat->values) delete (THRUSTARRAY*)loTriFactorT->csrMat->values;
1704*aa372e3fSPaul Mullowney 	  if (loTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->column_indices;
1705*aa372e3fSPaul Mullowney 	  if (loTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->row_offsets;
1706*aa372e3fSPaul Mullowney 	  delete (CsrMatrix*)loTriFactorT->csrMat;
1707*aa372e3fSPaul Mullowney 	}
1708*aa372e3fSPaul Mullowney         if (loTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactorT;
1709*aa372e3fSPaul Mullowney       }
1710*aa372e3fSPaul Mullowney 
1711*aa372e3fSPaul Mullowney       if (upTriFactorT) {
1712*aa372e3fSPaul Mullowney         if (upTriFactorT->descr) { stat = cusparseDestroyMatDescr(upTriFactorT->descr);CHKERRCUSP(stat); }
1713*aa372e3fSPaul Mullowney         if (upTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactorT->solveInfo);CHKERRCUSP(stat); }
1714*aa372e3fSPaul Mullowney         if (upTriFactorT->csrMat) {
1715*aa372e3fSPaul Mullowney 	  if (upTriFactorT->csrMat->values) delete (THRUSTARRAY*)upTriFactorT->csrMat->values;
1716*aa372e3fSPaul Mullowney 	  if (upTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->column_indices;
1717*aa372e3fSPaul Mullowney 	  if (upTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->row_offsets;
1718*aa372e3fSPaul Mullowney 	  delete (CsrMatrix*)upTriFactorT->csrMat;
1719*aa372e3fSPaul Mullowney 	}
1720*aa372e3fSPaul Mullowney         if (upTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactorT;
1721*aa372e3fSPaul Mullowney       }
1722*aa372e3fSPaul Mullowney       if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices;
1723*aa372e3fSPaul Mullowney       if (cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices;
1724*aa372e3fSPaul Mullowney       if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector;
1725*aa372e3fSPaul Mullowney       if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); }
1726*aa372e3fSPaul Mullowney       if (cusparseTriFactors) delete (Mat_SeqAIJCUSPARSETriFactors*)cusparseTriFactors;
17279ae82921SPaul Mullowney     } catch(char *ex) {
17289ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
17299ae82921SPaul Mullowney     }
17309ae82921SPaul Mullowney   }
17319ae82921SPaul Mullowney   /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
17329ae82921SPaul Mullowney   A->spptr = 0;
17339ae82921SPaul Mullowney 
17349ae82921SPaul Mullowney   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
17359ae82921SPaul Mullowney   PetscFunctionReturn(0);
17369ae82921SPaul Mullowney }
17379ae82921SPaul Mullowney 
17389ae82921SPaul Mullowney #undef __FUNCT__
17399ae82921SPaul Mullowney #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
17408cc058d9SJed Brown PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
17419ae82921SPaul Mullowney {
17429ae82921SPaul Mullowney   PetscErrorCode ierr;
1743*aa372e3fSPaul Mullowney   cusparseStatus_t stat;
1744*aa372e3fSPaul Mullowney   cusparseHandle_t handle=0;
17459ae82921SPaul Mullowney 
17469ae82921SPaul Mullowney   PetscFunctionBegin;
17479ae82921SPaul Mullowney   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
17489ae82921SPaul Mullowney   if (B->factortype==MAT_FACTOR_NONE) {
1749e057df02SPaul Mullowney     /* you cannot check the inode.use flag here since the matrix was just created.
1750e057df02SPaul Mullowney        now build a GPU matrix data structure */
17519ae82921SPaul Mullowney     B->spptr = new Mat_SeqAIJCUSPARSE;
17529ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
1753*aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1754*aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
1755e057df02SPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
1756*aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1757*aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = 0;
1758*aa372e3fSPaul Mullowney     stat = cusparseCreate(&handle);CHKERRCUSP(stat);
1759*aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
1760*aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
17619ae82921SPaul Mullowney   } else {
17629ae82921SPaul Mullowney     /* NEXT, set the pointers to the triangular factors */
1763debe9ee2SPaul Mullowney     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
17649ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
17659ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
1766*aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1767*aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1768*aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
1769*aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
1770*aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
1771e057df02SPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format                  = cusparseMatSolveStorageFormat;
1772*aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = 0;
1773*aa372e3fSPaul Mullowney     stat = cusparseCreate(&handle);CHKERRCUSP(stat);
1774*aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
1775*aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
17769ae82921SPaul Mullowney   }
1777*aa372e3fSPaul Mullowney 
1778e057df02SPaul Mullowney   /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
1779e057df02SPaul Mullowney      default cusparse tri solve. Note the difference with the implementation in
1780e057df02SPaul Mullowney      MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
1781bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
17822205254eSKarl Rupp 
17839ae82921SPaul Mullowney   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
17849ae82921SPaul Mullowney   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
17859ae82921SPaul Mullowney   B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
17869ae82921SPaul Mullowney   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1787ca45077fSPaul Mullowney   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
1788ca45077fSPaul Mullowney   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1789ca45077fSPaul Mullowney   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1790ca45077fSPaul Mullowney   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
17912205254eSKarl Rupp 
17929ae82921SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
17932205254eSKarl Rupp 
17949ae82921SPaul Mullowney   B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
17952205254eSKarl Rupp 
1796bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
17979ae82921SPaul Mullowney   PetscFunctionReturn(0);
17989ae82921SPaul Mullowney }
17999ae82921SPaul Mullowney 
1800e057df02SPaul Mullowney /*M
1801e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1802e057df02SPaul Mullowney 
1803e057df02SPaul Mullowney    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1804e057df02SPaul Mullowney    CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
1805*aa372e3fSPaul Mullowney    the CUSPARSE library.
1806e057df02SPaul Mullowney 
1807e057df02SPaul Mullowney    Options Database Keys:
1808e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1809*aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1810*aa372e3fSPaul Mullowney .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1811*aa372e3fSPaul Mullowney -  -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1812e057df02SPaul Mullowney 
1813e057df02SPaul Mullowney   Level: beginner
1814e057df02SPaul Mullowney 
18158468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1816e057df02SPaul Mullowney M*/
1817