xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision ce8146529ec8918a9e13a256688e619a410abb90)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3bc3f50f2SPaul Mullowney   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
59ae82921SPaul Mullowney 
63d13b8fdSMatthew G. Knepley #include <petscconf.h>
73d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
8087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
93d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
103d13b8fdSMatthew G. Knepley #include <petsc-private/vecimpl.h>
119ae82921SPaul Mullowney #undef VecType
123d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
13bc3f50f2SPaul Mullowney 
14e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
159ae82921SPaul Mullowney 
16087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
17087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
19087f3262SPaul Mullowney 
206fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
216fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
226fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
23087f3262SPaul Mullowney 
246fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
256fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
266fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
276fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
286fa9248bSJed Brown static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
296fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
306fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
316fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
326fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
339ae82921SPaul Mullowney 
347f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
357f756511SDominic Meiser static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
367f756511SDominic Meiser static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
377f756511SDominic Meiser static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
387f756511SDominic Meiser static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
397f756511SDominic Meiser 
409ae82921SPaul Mullowney #undef __FUNCT__
41b06137fdSPaul Mullowney #define __FUNCT__ "MatCUSPARSESetStream"
42b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
43b06137fdSPaul Mullowney {
44b06137fdSPaul Mullowney   cusparseStatus_t   stat;
45b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
46b06137fdSPaul Mullowney 
47b06137fdSPaul Mullowney   PetscFunctionBegin;
48b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
49b06137fdSPaul Mullowney   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSP(stat);
50b06137fdSPaul Mullowney   PetscFunctionReturn(0);
51b06137fdSPaul Mullowney }
52b06137fdSPaul Mullowney 
53b06137fdSPaul Mullowney #undef __FUNCT__
54b06137fdSPaul Mullowney #define __FUNCT__ "MatCUSPARSESetHandle"
55b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
56b06137fdSPaul Mullowney {
57b06137fdSPaul Mullowney   cusparseStatus_t   stat;
58b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
59b06137fdSPaul Mullowney 
60b06137fdSPaul Mullowney   PetscFunctionBegin;
61b06137fdSPaul Mullowney   if (cusparsestruct->handle)
62b06137fdSPaul Mullowney     stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat);
63b06137fdSPaul Mullowney   cusparsestruct->handle = handle;
64b06137fdSPaul Mullowney   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
65b06137fdSPaul Mullowney   PetscFunctionReturn(0);
66b06137fdSPaul Mullowney }
67b06137fdSPaul Mullowney 
68b06137fdSPaul Mullowney #undef __FUNCT__
69b06137fdSPaul Mullowney #define __FUNCT__ "MatCUSPARSEClearHandle"
70b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
71b06137fdSPaul Mullowney {
72b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
73b06137fdSPaul Mullowney   PetscFunctionBegin;
74b06137fdSPaul Mullowney   if (cusparsestruct->handle)
75b06137fdSPaul Mullowney     cusparsestruct->handle = 0;
76b06137fdSPaul Mullowney   PetscFunctionReturn(0);
77b06137fdSPaul Mullowney }
78b06137fdSPaul Mullowney 
79b06137fdSPaul Mullowney #undef __FUNCT__
809ae82921SPaul Mullowney #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
819ae82921SPaul Mullowney PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
829ae82921SPaul Mullowney {
839ae82921SPaul Mullowney   PetscFunctionBegin;
849ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
859ae82921SPaul Mullowney   PetscFunctionReturn(0);
869ae82921SPaul Mullowney }
879ae82921SPaul Mullowney 
88c708e6cdSJed Brown /*MC
89087f3262SPaul Mullowney   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
90087f3262SPaul Mullowney   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
91087f3262SPaul Mullowney   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
92087f3262SPaul Mullowney   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
93087f3262SPaul Mullowney   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
94087f3262SPaul Mullowney   algorithms are not recommended. This class does NOT support direct solver operations.
95c708e6cdSJed Brown 
969ae82921SPaul Mullowney   Level: beginner
97c708e6cdSJed Brown 
98c708e6cdSJed Brown .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
99c708e6cdSJed Brown M*/
1009ae82921SPaul Mullowney 
1019ae82921SPaul Mullowney #undef __FUNCT__
1029ae82921SPaul Mullowney #define __FUNCT__ "MatGetFactor_seqaij_cusparse"
1033c3a0fd4SSatish Balay PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
1049ae82921SPaul Mullowney {
1059ae82921SPaul Mullowney   PetscErrorCode ierr;
106bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1079ae82921SPaul Mullowney 
1089ae82921SPaul Mullowney   PetscFunctionBegin;
109bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
110404133a2SPaul Mullowney   (*B)->factortype = ftype;
111bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1129ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1132205254eSKarl Rupp 
114087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
11533d57670SJed Brown     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1169ae82921SPaul Mullowney     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1179ae82921SPaul Mullowney     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
118087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
119087f3262SPaul Mullowney     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
120087f3262SPaul Mullowney     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1219ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
122bc3f50f2SPaul Mullowney 
123fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
12462a20339SJed Brown   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
1259ae82921SPaul Mullowney   PetscFunctionReturn(0);
1269ae82921SPaul Mullowney }
1279ae82921SPaul Mullowney 
1289ae82921SPaul Mullowney #undef __FUNCT__
129e057df02SPaul Mullowney #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
130bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
131ca45077fSPaul Mullowney {
132aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1336e111a19SKarl Rupp 
134ca45077fSPaul Mullowney   PetscFunctionBegin;
1352692e278SPaul Mullowney #if CUDA_VERSION>=4020
136ca45077fSPaul Mullowney   switch (op) {
137e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
138aa372e3fSPaul Mullowney     cusparsestruct->format = format;
139ca45077fSPaul Mullowney     break;
140e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
141aa372e3fSPaul Mullowney     cusparsestruct->format = format;
142ca45077fSPaul Mullowney     break;
143ca45077fSPaul Mullowney   default:
14436d62e41SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
145ca45077fSPaul Mullowney   }
1462692e278SPaul Mullowney #else
1472692e278SPaul Mullowney   if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB)
1482692e278SPaul Mullowney     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
1492692e278SPaul Mullowney #endif
150ca45077fSPaul Mullowney   PetscFunctionReturn(0);
151ca45077fSPaul Mullowney }
1529ae82921SPaul Mullowney 
153e057df02SPaul Mullowney /*@
154e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
155e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
156aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
157e057df02SPaul Mullowney    Not Collective
158e057df02SPaul Mullowney 
159e057df02SPaul Mullowney    Input Parameters:
1608468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
16136d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
1622692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
163e057df02SPaul Mullowney 
164e057df02SPaul Mullowney    Output Parameter:
165e057df02SPaul Mullowney 
166e057df02SPaul Mullowney    Level: intermediate
167e057df02SPaul Mullowney 
1688468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
169e057df02SPaul Mullowney @*/
170e057df02SPaul Mullowney #undef __FUNCT__
171e057df02SPaul Mullowney #define __FUNCT__ "MatCUSPARSESetFormat"
172e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
173e057df02SPaul Mullowney {
174e057df02SPaul Mullowney   PetscErrorCode ierr;
1756e111a19SKarl Rupp 
176e057df02SPaul Mullowney   PetscFunctionBegin;
177e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
178e057df02SPaul Mullowney   ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
179e057df02SPaul Mullowney   PetscFunctionReturn(0);
180e057df02SPaul Mullowney }
181e057df02SPaul Mullowney 
1829ae82921SPaul Mullowney #undef __FUNCT__
1839ae82921SPaul Mullowney #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
1846fa9248bSJed Brown static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
1859ae82921SPaul Mullowney {
1869ae82921SPaul Mullowney   PetscErrorCode           ierr;
187e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
1889ae82921SPaul Mullowney   PetscBool                flg;
1896e111a19SKarl Rupp 
1909ae82921SPaul Mullowney   PetscFunctionBegin;
191e057df02SPaul Mullowney   ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
192e057df02SPaul Mullowney   ierr = PetscObjectOptionsBegin((PetscObject)A);
1939ae82921SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
194e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
1957083f85cSSatish Balay                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
196e057df02SPaul Mullowney     if (flg) {
197e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
198045c96e1SPaul Mullowney     }
1999ae82921SPaul Mullowney   }
2004c87dfd4SPaul Mullowney   ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
2017083f85cSSatish Balay                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
2024c87dfd4SPaul Mullowney   if (flg) {
2034c87dfd4SPaul Mullowney     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
2044c87dfd4SPaul Mullowney   }
2059ae82921SPaul Mullowney   ierr = PetscOptionsEnd();CHKERRQ(ierr);
2069ae82921SPaul Mullowney   PetscFunctionReturn(0);
2079ae82921SPaul Mullowney 
2089ae82921SPaul Mullowney }
2099ae82921SPaul Mullowney 
2109ae82921SPaul Mullowney #undef __FUNCT__
2119ae82921SPaul Mullowney #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
2126fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2139ae82921SPaul Mullowney {
2149ae82921SPaul Mullowney   PetscErrorCode ierr;
2159ae82921SPaul Mullowney 
2169ae82921SPaul Mullowney   PetscFunctionBegin;
2179ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
2189ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2199ae82921SPaul Mullowney   PetscFunctionReturn(0);
2209ae82921SPaul Mullowney }
2219ae82921SPaul Mullowney 
2229ae82921SPaul Mullowney #undef __FUNCT__
2239ae82921SPaul Mullowney #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
2246fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2259ae82921SPaul Mullowney {
2269ae82921SPaul Mullowney   PetscErrorCode ierr;
2279ae82921SPaul Mullowney 
2289ae82921SPaul Mullowney   PetscFunctionBegin;
2299ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
2309ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2319ae82921SPaul Mullowney   PetscFunctionReturn(0);
2329ae82921SPaul Mullowney }
2339ae82921SPaul Mullowney 
2349ae82921SPaul Mullowney #undef __FUNCT__
235087f3262SPaul Mullowney #define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
236087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
237087f3262SPaul Mullowney {
238087f3262SPaul Mullowney   PetscErrorCode ierr;
239087f3262SPaul Mullowney 
240087f3262SPaul Mullowney   PetscFunctionBegin;
241087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
242087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
243087f3262SPaul Mullowney   PetscFunctionReturn(0);
244087f3262SPaul Mullowney }
245087f3262SPaul Mullowney 
246087f3262SPaul Mullowney #undef __FUNCT__
247087f3262SPaul Mullowney #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
248087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
249087f3262SPaul Mullowney {
250087f3262SPaul Mullowney   PetscErrorCode ierr;
251087f3262SPaul Mullowney 
252087f3262SPaul Mullowney   PetscFunctionBegin;
253087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
254087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
255087f3262SPaul Mullowney   PetscFunctionReturn(0);
256087f3262SPaul Mullowney }
257087f3262SPaul Mullowney 
258087f3262SPaul Mullowney #undef __FUNCT__
259087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
260087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
2619ae82921SPaul Mullowney {
2629ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
2639ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
2649ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
265aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
2669ae82921SPaul Mullowney   cusparseStatus_t                  stat;
2679ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
2689ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
2699ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
2709ae82921SPaul Mullowney   PetscScalar                       *AALo;
2719ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
272b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
2739ae82921SPaul Mullowney 
2749ae82921SPaul Mullowney   PetscFunctionBegin;
2759ae82921SPaul Mullowney   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
2769ae82921SPaul Mullowney     try {
2779ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
2789ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
2799ae82921SPaul Mullowney 
2809ae82921SPaul Mullowney       /* Allocate Space for the lower triangular matrix */
2819ae82921SPaul Mullowney       ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
2829ae82921SPaul Mullowney       ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
2839ae82921SPaul Mullowney       ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);
2849ae82921SPaul Mullowney 
2859ae82921SPaul Mullowney       /* Fill the lower triangular matrix */
2869ae82921SPaul Mullowney       AiLo[0]  = (PetscInt) 0;
2879ae82921SPaul Mullowney       AiLo[n]  = nzLower;
2889ae82921SPaul Mullowney       AjLo[0]  = (PetscInt) 0;
2899ae82921SPaul Mullowney       AALo[0]  = (MatScalar) 1.0;
2909ae82921SPaul Mullowney       v        = aa;
2919ae82921SPaul Mullowney       vi       = aj;
2929ae82921SPaul Mullowney       offset   = 1;
2939ae82921SPaul Mullowney       rowOffset= 1;
2949ae82921SPaul Mullowney       for (i=1; i<n; i++) {
2959ae82921SPaul Mullowney         nz = ai[i+1] - ai[i];
296e057df02SPaul Mullowney         /* additional 1 for the term on the diagonal */
2979ae82921SPaul Mullowney         AiLo[i]    = rowOffset;
2989ae82921SPaul Mullowney         rowOffset += nz+1;
2999ae82921SPaul Mullowney 
3009ae82921SPaul Mullowney         ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
3019ae82921SPaul Mullowney         ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
3029ae82921SPaul Mullowney 
3039ae82921SPaul Mullowney         offset      += nz;
3049ae82921SPaul Mullowney         AjLo[offset] = (PetscInt) i;
3059ae82921SPaul Mullowney         AALo[offset] = (MatScalar) 1.0;
3069ae82921SPaul Mullowney         offset      += 1;
3079ae82921SPaul Mullowney 
3089ae82921SPaul Mullowney         v  += nz;
3099ae82921SPaul Mullowney         vi += nz;
3109ae82921SPaul Mullowney       }
3112205254eSKarl Rupp 
312aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
313aa372e3fSPaul Mullowney       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
3142205254eSKarl Rupp 
315aa372e3fSPaul Mullowney       /* Create the matrix description */
316aa372e3fSPaul Mullowney       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
317aa372e3fSPaul Mullowney       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
318aa372e3fSPaul Mullowney       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
319aa372e3fSPaul Mullowney       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
320aa372e3fSPaul Mullowney       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
321aa372e3fSPaul Mullowney 
322aa372e3fSPaul Mullowney       /* Create the solve analysis information */
323aa372e3fSPaul Mullowney       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);
324aa372e3fSPaul Mullowney 
325aa372e3fSPaul Mullowney       /* set the operation */
326aa372e3fSPaul Mullowney       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
327aa372e3fSPaul Mullowney 
328aa372e3fSPaul Mullowney       /* set the matrix */
329aa372e3fSPaul Mullowney       loTriFactor->csrMat = new CsrMatrix;
330aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_rows = n;
331aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_cols = n;
332aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_entries = nzLower;
333aa372e3fSPaul Mullowney 
334aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
335aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
336aa372e3fSPaul Mullowney 
337aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
338aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
339aa372e3fSPaul Mullowney 
340aa372e3fSPaul Mullowney       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
341aa372e3fSPaul Mullowney       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
342aa372e3fSPaul Mullowney 
343aa372e3fSPaul Mullowney       /* perform the solve analysis */
344aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
345aa372e3fSPaul Mullowney                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
346aa372e3fSPaul Mullowney                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
347aa372e3fSPaul Mullowney                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);
348aa372e3fSPaul Mullowney 
349aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
350aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
3512205254eSKarl Rupp 
3529ae82921SPaul Mullowney       ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
3539ae82921SPaul Mullowney       ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
3549ae82921SPaul Mullowney       ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
3559ae82921SPaul Mullowney     } catch(char *ex) {
3569ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
3579ae82921SPaul Mullowney     }
3589ae82921SPaul Mullowney   }
3599ae82921SPaul Mullowney   PetscFunctionReturn(0);
3609ae82921SPaul Mullowney }
3619ae82921SPaul Mullowney 
3629ae82921SPaul Mullowney #undef __FUNCT__
363087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
364087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
3659ae82921SPaul Mullowney {
3669ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3679ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3689ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
369aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
3709ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3719ae82921SPaul Mullowney   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
3729ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3739ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
3749ae82921SPaul Mullowney   PetscScalar                       *AAUp;
3759ae82921SPaul Mullowney   PetscInt                          i,nz, nzUpper, offset;
3769ae82921SPaul Mullowney   PetscErrorCode                    ierr;
3779ae82921SPaul Mullowney 
3789ae82921SPaul Mullowney   PetscFunctionBegin;
3799ae82921SPaul Mullowney   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
3809ae82921SPaul Mullowney     try {
3819ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
3829ae82921SPaul Mullowney       nzUpper = adiag[0]-adiag[n];
3839ae82921SPaul Mullowney 
3849ae82921SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
3859ae82921SPaul Mullowney       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
3869ae82921SPaul Mullowney       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
3879ae82921SPaul Mullowney       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
3889ae82921SPaul Mullowney 
3899ae82921SPaul Mullowney       /* Fill the upper triangular matrix */
3909ae82921SPaul Mullowney       AiUp[0]=(PetscInt) 0;
3919ae82921SPaul Mullowney       AiUp[n]=nzUpper;
3929ae82921SPaul Mullowney       offset = nzUpper;
3939ae82921SPaul Mullowney       for (i=n-1; i>=0; i--) {
3949ae82921SPaul Mullowney         v  = aa + adiag[i+1] + 1;
3959ae82921SPaul Mullowney         vi = aj + adiag[i+1] + 1;
3969ae82921SPaul Mullowney 
397e057df02SPaul Mullowney         /* number of elements NOT on the diagonal */
3989ae82921SPaul Mullowney         nz = adiag[i] - adiag[i+1]-1;
3999ae82921SPaul Mullowney 
400e057df02SPaul Mullowney         /* decrement the offset */
4019ae82921SPaul Mullowney         offset -= (nz+1);
4029ae82921SPaul Mullowney 
403e057df02SPaul Mullowney         /* first, set the diagonal elements */
4049ae82921SPaul Mullowney         AjUp[offset] = (PetscInt) i;
4059ae82921SPaul Mullowney         AAUp[offset] = 1./v[nz];
4069ae82921SPaul Mullowney         AiUp[i]      = AiUp[i+1] - (nz+1);
4079ae82921SPaul Mullowney 
4089ae82921SPaul Mullowney         ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
4099ae82921SPaul Mullowney         ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
4109ae82921SPaul Mullowney       }
4112205254eSKarl Rupp 
412aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
413aa372e3fSPaul Mullowney       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
4142205254eSKarl Rupp 
415aa372e3fSPaul Mullowney       /* Create the matrix description */
416aa372e3fSPaul Mullowney       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
417aa372e3fSPaul Mullowney       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
418aa372e3fSPaul Mullowney       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
419aa372e3fSPaul Mullowney       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
420aa372e3fSPaul Mullowney       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
421aa372e3fSPaul Mullowney 
422aa372e3fSPaul Mullowney       /* Create the solve analysis information */
423aa372e3fSPaul Mullowney       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);
424aa372e3fSPaul Mullowney 
425aa372e3fSPaul Mullowney       /* set the operation */
426aa372e3fSPaul Mullowney       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
427aa372e3fSPaul Mullowney 
428aa372e3fSPaul Mullowney       /* set the matrix */
429aa372e3fSPaul Mullowney       upTriFactor->csrMat = new CsrMatrix;
430aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_rows = n;
431aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_cols = n;
432aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_entries = nzUpper;
433aa372e3fSPaul Mullowney 
434aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
435aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
436aa372e3fSPaul Mullowney 
437aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
438aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
439aa372e3fSPaul Mullowney 
440aa372e3fSPaul Mullowney       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
441aa372e3fSPaul Mullowney       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
442aa372e3fSPaul Mullowney 
443aa372e3fSPaul Mullowney       /* perform the solve analysis */
444aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
445aa372e3fSPaul Mullowney                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
446aa372e3fSPaul Mullowney                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
447aa372e3fSPaul Mullowney                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);
448aa372e3fSPaul Mullowney 
449aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
450aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
4512205254eSKarl Rupp 
4529ae82921SPaul Mullowney       ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
4539ae82921SPaul Mullowney       ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
4549ae82921SPaul Mullowney       ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
4559ae82921SPaul Mullowney     } catch(char *ex) {
4569ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4579ae82921SPaul Mullowney     }
4589ae82921SPaul Mullowney   }
4599ae82921SPaul Mullowney   PetscFunctionReturn(0);
4609ae82921SPaul Mullowney }
4619ae82921SPaul Mullowney 
4629ae82921SPaul Mullowney #undef __FUNCT__
463087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
464087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
4659ae82921SPaul Mullowney {
4669ae82921SPaul Mullowney   PetscErrorCode               ierr;
4679ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
4689ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
4699ae82921SPaul Mullowney   IS                           isrow = a->row,iscol = a->icol;
4709ae82921SPaul Mullowney   PetscBool                    row_identity,col_identity;
4719ae82921SPaul Mullowney   const PetscInt               *r,*c;
4729ae82921SPaul Mullowney   PetscInt                     n = A->rmap->n;
4739ae82921SPaul Mullowney 
4749ae82921SPaul Mullowney   PetscFunctionBegin;
475087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
476087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
4772205254eSKarl Rupp 
478aa372e3fSPaul Mullowney   cusparseTriFactors->workVector = new THRUSTARRAY;
479aa372e3fSPaul Mullowney   cusparseTriFactors->workVector->resize(n);
480aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=a->nz;
4819ae82921SPaul Mullowney 
4829ae82921SPaul Mullowney   A->valid_GPU_matrix = PETSC_CUSP_BOTH;
483e057df02SPaul Mullowney   /*lower triangular indices */
4849ae82921SPaul Mullowney   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4859ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
4862205254eSKarl Rupp   if (!row_identity) {
487aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
488aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r+n);
4892205254eSKarl Rupp   }
4909ae82921SPaul Mullowney   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
4919ae82921SPaul Mullowney 
492e057df02SPaul Mullowney   /*upper triangular indices */
4939ae82921SPaul Mullowney   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
4949ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
4952205254eSKarl Rupp   if (!col_identity) {
496aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
497aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c+n);
4982205254eSKarl Rupp   }
4999ae82921SPaul Mullowney   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
5009ae82921SPaul Mullowney   PetscFunctionReturn(0);
5019ae82921SPaul Mullowney }
5029ae82921SPaul Mullowney 
5039ae82921SPaul Mullowney #undef __FUNCT__
504087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
505087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
506087f3262SPaul Mullowney {
507087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
508087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
509aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
510aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
511087f3262SPaul Mullowney   cusparseStatus_t                  stat;
512087f3262SPaul Mullowney   PetscErrorCode                    ierr;
513087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
514087f3262SPaul Mullowney   PetscScalar                       *AAUp;
515087f3262SPaul Mullowney   PetscScalar                       *AALo;
516087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
517087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
518087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
519087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
520087f3262SPaul Mullowney 
521087f3262SPaul Mullowney   PetscFunctionBegin;
522087f3262SPaul Mullowney   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
523087f3262SPaul Mullowney     try {
524087f3262SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
525087f3262SPaul Mullowney       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
526087f3262SPaul Mullowney       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
527087f3262SPaul Mullowney       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
528087f3262SPaul Mullowney       ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
529087f3262SPaul Mullowney 
530087f3262SPaul Mullowney       /* Fill the upper triangular matrix */
531087f3262SPaul Mullowney       AiUp[0]=(PetscInt) 0;
532087f3262SPaul Mullowney       AiUp[n]=nzUpper;
533087f3262SPaul Mullowney       offset = 0;
534087f3262SPaul Mullowney       for (i=0; i<n; i++) {
535087f3262SPaul Mullowney         /* set the pointers */
536087f3262SPaul Mullowney         v  = aa + ai[i];
537087f3262SPaul Mullowney         vj = aj + ai[i];
538087f3262SPaul Mullowney         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
539087f3262SPaul Mullowney 
540087f3262SPaul Mullowney         /* first, set the diagonal elements */
541087f3262SPaul Mullowney         AjUp[offset] = (PetscInt) i;
542087f3262SPaul Mullowney         AAUp[offset] = 1.0/v[nz];
543087f3262SPaul Mullowney         AiUp[i]      = offset;
544087f3262SPaul Mullowney         AALo[offset] = 1.0/v[nz];
545087f3262SPaul Mullowney 
546087f3262SPaul Mullowney         offset+=1;
547087f3262SPaul Mullowney         if (nz>0) {
548087f3262SPaul Mullowney           ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
549087f3262SPaul Mullowney           ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
550087f3262SPaul Mullowney           for (j=offset; j<offset+nz; j++) {
551087f3262SPaul Mullowney             AAUp[j] = -AAUp[j];
552087f3262SPaul Mullowney             AALo[j] = AAUp[j]/v[nz];
553087f3262SPaul Mullowney           }
554087f3262SPaul Mullowney           offset+=nz;
555087f3262SPaul Mullowney         }
556087f3262SPaul Mullowney       }
557087f3262SPaul Mullowney 
558aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
559aa372e3fSPaul Mullowney       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
560087f3262SPaul Mullowney 
561aa372e3fSPaul Mullowney       /* Create the matrix description */
562aa372e3fSPaul Mullowney       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
563aa372e3fSPaul Mullowney       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
564aa372e3fSPaul Mullowney       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
565aa372e3fSPaul Mullowney       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
566aa372e3fSPaul Mullowney       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
567087f3262SPaul Mullowney 
568aa372e3fSPaul Mullowney       /* Create the solve analysis information */
569aa372e3fSPaul Mullowney       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);
570aa372e3fSPaul Mullowney 
571aa372e3fSPaul Mullowney       /* set the operation */
572aa372e3fSPaul Mullowney       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
573aa372e3fSPaul Mullowney 
574aa372e3fSPaul Mullowney       /* set the matrix */
575aa372e3fSPaul Mullowney       upTriFactor->csrMat = new CsrMatrix;
576aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_rows = A->rmap->n;
577aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_cols = A->cmap->n;
578aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_entries = a->nz;
579aa372e3fSPaul Mullowney 
580aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
581aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
582aa372e3fSPaul Mullowney 
583aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
584aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
585aa372e3fSPaul Mullowney 
586aa372e3fSPaul Mullowney       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
587aa372e3fSPaul Mullowney       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
588aa372e3fSPaul Mullowney 
589aa372e3fSPaul Mullowney       /* perform the solve analysis */
590aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
591aa372e3fSPaul Mullowney                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
592aa372e3fSPaul Mullowney                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
593aa372e3fSPaul Mullowney                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);
594aa372e3fSPaul Mullowney 
595aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
596aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
597aa372e3fSPaul Mullowney 
598aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
599aa372e3fSPaul Mullowney       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
600aa372e3fSPaul Mullowney 
601aa372e3fSPaul Mullowney       /* Create the matrix description */
602aa372e3fSPaul Mullowney       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
603aa372e3fSPaul Mullowney       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
604aa372e3fSPaul Mullowney       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
605aa372e3fSPaul Mullowney       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
606aa372e3fSPaul Mullowney       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
607aa372e3fSPaul Mullowney 
608aa372e3fSPaul Mullowney       /* Create the solve analysis information */
609aa372e3fSPaul Mullowney       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);
610aa372e3fSPaul Mullowney 
611aa372e3fSPaul Mullowney       /* set the operation */
612aa372e3fSPaul Mullowney       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
613aa372e3fSPaul Mullowney 
614aa372e3fSPaul Mullowney       /* set the matrix */
615aa372e3fSPaul Mullowney       loTriFactor->csrMat = new CsrMatrix;
616aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_rows = A->rmap->n;
617aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_cols = A->cmap->n;
618aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_entries = a->nz;
619aa372e3fSPaul Mullowney 
620aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
621aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
622aa372e3fSPaul Mullowney 
623aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
624aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
625aa372e3fSPaul Mullowney 
626aa372e3fSPaul Mullowney       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
627aa372e3fSPaul Mullowney       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
628aa372e3fSPaul Mullowney 
629aa372e3fSPaul Mullowney       /* perform the solve analysis */
630aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
631aa372e3fSPaul Mullowney                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
632aa372e3fSPaul Mullowney                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
633aa372e3fSPaul Mullowney                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);
634aa372e3fSPaul Mullowney 
635aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
636aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
637087f3262SPaul Mullowney 
638087f3262SPaul Mullowney       A->valid_GPU_matrix = PETSC_CUSP_BOTH;
639087f3262SPaul Mullowney       ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
640087f3262SPaul Mullowney       ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
641087f3262SPaul Mullowney       ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
642087f3262SPaul Mullowney       ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
643087f3262SPaul Mullowney     } catch(char *ex) {
644087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
645087f3262SPaul Mullowney     }
646087f3262SPaul Mullowney   }
647087f3262SPaul Mullowney   PetscFunctionReturn(0);
648087f3262SPaul Mullowney }
649087f3262SPaul Mullowney 
650087f3262SPaul Mullowney #undef __FUNCT__
651087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
652087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
6539ae82921SPaul Mullowney {
6549ae82921SPaul Mullowney   PetscErrorCode               ierr;
655087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
656087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
657087f3262SPaul Mullowney   IS                           ip = a->row;
658087f3262SPaul Mullowney   const PetscInt               *rip;
659087f3262SPaul Mullowney   PetscBool                    perm_identity;
660087f3262SPaul Mullowney   PetscInt                     n = A->rmap->n;
661087f3262SPaul Mullowney 
662087f3262SPaul Mullowney   PetscFunctionBegin;
663087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
664aa372e3fSPaul Mullowney   cusparseTriFactors->workVector = new THRUSTARRAY;
665aa372e3fSPaul Mullowney   cusparseTriFactors->workVector->resize(n);
666aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
667aa372e3fSPaul Mullowney 
668087f3262SPaul Mullowney   /*lower triangular indices */
669087f3262SPaul Mullowney   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
670087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
671087f3262SPaul Mullowney   if (!perm_identity) {
672aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
673aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
674aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
675aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(rip, rip+n);
676087f3262SPaul Mullowney   }
677087f3262SPaul Mullowney   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
678087f3262SPaul Mullowney   PetscFunctionReturn(0);
679087f3262SPaul Mullowney }
680087f3262SPaul Mullowney 
681087f3262SPaul Mullowney #undef __FUNCT__
6829ae82921SPaul Mullowney #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
6836fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
6849ae82921SPaul Mullowney {
6859ae82921SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
6869ae82921SPaul Mullowney   IS             isrow = b->row,iscol = b->col;
6879ae82921SPaul Mullowney   PetscBool      row_identity,col_identity;
688b175d8bbSPaul Mullowney   PetscErrorCode ierr;
6899ae82921SPaul Mullowney 
6909ae82921SPaul Mullowney   PetscFunctionBegin;
6919ae82921SPaul Mullowney   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
692e057df02SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
6939ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
6949ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
695bda325fcSPaul Mullowney   if (row_identity && col_identity) {
696bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
697bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
698bda325fcSPaul Mullowney   } else {
699bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
700bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
701bda325fcSPaul Mullowney   }
7028dc1d2a3SPaul Mullowney 
703e057df02SPaul Mullowney   /* get the triangular factors */
704087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
7059ae82921SPaul Mullowney   PetscFunctionReturn(0);
7069ae82921SPaul Mullowney }
7079ae82921SPaul Mullowney 
708087f3262SPaul Mullowney #undef __FUNCT__
709087f3262SPaul Mullowney #define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
710087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
711087f3262SPaul Mullowney {
712087f3262SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
713087f3262SPaul Mullowney   IS             ip = b->row;
714087f3262SPaul Mullowney   PetscBool      perm_identity;
715b175d8bbSPaul Mullowney   PetscErrorCode ierr;
716087f3262SPaul Mullowney 
717087f3262SPaul Mullowney   PetscFunctionBegin;
718087f3262SPaul Mullowney   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
719087f3262SPaul Mullowney 
720087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
721087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
722087f3262SPaul Mullowney   if (perm_identity) {
723087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
724087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
725087f3262SPaul Mullowney   } else {
726087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
727087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
728087f3262SPaul Mullowney   }
729087f3262SPaul Mullowney 
730087f3262SPaul Mullowney   /* get the triangular factors */
731087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
732087f3262SPaul Mullowney   PetscFunctionReturn(0);
733087f3262SPaul Mullowney }
7349ae82921SPaul Mullowney 
735bda325fcSPaul Mullowney #undef __FUNCT__
736bda325fcSPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
737b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
738bda325fcSPaul Mullowney {
739bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
740aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
741aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
742aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
743aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
744bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
745aa372e3fSPaul Mullowney   cusparseIndexBase_t               indexBase;
746aa372e3fSPaul Mullowney   cusparseMatrixType_t              matrixType;
747aa372e3fSPaul Mullowney   cusparseFillMode_t                fillMode;
748aa372e3fSPaul Mullowney   cusparseDiagType_t                diagType;
749b175d8bbSPaul Mullowney 
750bda325fcSPaul Mullowney   PetscFunctionBegin;
751bda325fcSPaul Mullowney 
752aa372e3fSPaul Mullowney   /*********************************************/
753aa372e3fSPaul Mullowney   /* Now the Transpose of the Lower Tri Factor */
754aa372e3fSPaul Mullowney   /*********************************************/
755aa372e3fSPaul Mullowney 
756aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
757aa372e3fSPaul Mullowney   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
758aa372e3fSPaul Mullowney 
759aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
760aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
761aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
762aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
763aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
764aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(loTriFactor->descr);
765aa372e3fSPaul Mullowney 
766aa372e3fSPaul Mullowney   /* Create the matrix description */
767aa372e3fSPaul Mullowney   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
768aa372e3fSPaul Mullowney   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
769aa372e3fSPaul Mullowney   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
770aa372e3fSPaul Mullowney   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
771aa372e3fSPaul Mullowney   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);
772aa372e3fSPaul Mullowney 
773aa372e3fSPaul Mullowney   /* Create the solve analysis information */
774aa372e3fSPaul Mullowney   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);
775aa372e3fSPaul Mullowney 
776aa372e3fSPaul Mullowney   /* set the operation */
777aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
778aa372e3fSPaul Mullowney 
779aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
780aa372e3fSPaul Mullowney   loTriFactorT->csrMat = new CsrMatrix;
781aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
782aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
783aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
784aa372e3fSPaul Mullowney   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
785aa372e3fSPaul Mullowney   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
786aa372e3fSPaul Mullowney   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);
787aa372e3fSPaul Mullowney 
788aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
789aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
790aa372e3fSPaul Mullowney                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
791aa372e3fSPaul Mullowney                           loTriFactor->csrMat->values->data().get(),
792aa372e3fSPaul Mullowney                           loTriFactor->csrMat->row_offsets->data().get(),
793aa372e3fSPaul Mullowney                           loTriFactor->csrMat->column_indices->data().get(),
794aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->values->data().get(),
795aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->column_indices->data().get(),
796aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->row_offsets->data().get(),
797aa372e3fSPaul Mullowney                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);
798aa372e3fSPaul Mullowney 
799aa372e3fSPaul Mullowney   /* perform the solve analysis on the transposed matrix */
800aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
801aa372e3fSPaul Mullowney                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
802aa372e3fSPaul Mullowney                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
803aa372e3fSPaul Mullowney                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
804aa372e3fSPaul Mullowney                            loTriFactorT->solveInfo);CHKERRCUSP(stat);
805aa372e3fSPaul Mullowney 
806aa372e3fSPaul Mullowney   /* assign the pointer. Is this really necessary? */
807aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
808aa372e3fSPaul Mullowney 
809aa372e3fSPaul Mullowney   /*********************************************/
810aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
811aa372e3fSPaul Mullowney   /*********************************************/
812aa372e3fSPaul Mullowney 
813aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
814aa372e3fSPaul Mullowney   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
815aa372e3fSPaul Mullowney 
816aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
817aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
818aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
819aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
820aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
821aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(upTriFactor->descr);
822aa372e3fSPaul Mullowney 
823aa372e3fSPaul Mullowney   /* Create the matrix description */
824aa372e3fSPaul Mullowney   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
825aa372e3fSPaul Mullowney   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
826aa372e3fSPaul Mullowney   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
827aa372e3fSPaul Mullowney   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
828aa372e3fSPaul Mullowney   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);
829aa372e3fSPaul Mullowney 
830aa372e3fSPaul Mullowney   /* Create the solve analysis information */
831aa372e3fSPaul Mullowney   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);
832aa372e3fSPaul Mullowney 
833aa372e3fSPaul Mullowney   /* set the operation */
834aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
835aa372e3fSPaul Mullowney 
836aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
837aa372e3fSPaul Mullowney   upTriFactorT->csrMat = new CsrMatrix;
838aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
839aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
840aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
841aa372e3fSPaul Mullowney   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
842aa372e3fSPaul Mullowney   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
843aa372e3fSPaul Mullowney   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);
844aa372e3fSPaul Mullowney 
845aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
846aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
847aa372e3fSPaul Mullowney                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
848aa372e3fSPaul Mullowney                           upTriFactor->csrMat->values->data().get(),
849aa372e3fSPaul Mullowney                           upTriFactor->csrMat->row_offsets->data().get(),
850aa372e3fSPaul Mullowney                           upTriFactor->csrMat->column_indices->data().get(),
851aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->values->data().get(),
852aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->column_indices->data().get(),
853aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->row_offsets->data().get(),
854aa372e3fSPaul Mullowney                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);
855aa372e3fSPaul Mullowney 
856aa372e3fSPaul Mullowney   /* perform the solve analysis on the transposed matrix */
857aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
858aa372e3fSPaul Mullowney                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
859aa372e3fSPaul Mullowney                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
860aa372e3fSPaul Mullowney                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
861aa372e3fSPaul Mullowney                            upTriFactorT->solveInfo);CHKERRCUSP(stat);
862aa372e3fSPaul Mullowney 
863aa372e3fSPaul Mullowney   /* assign the pointer. Is this really necessary? */
864aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
865bda325fcSPaul Mullowney   PetscFunctionReturn(0);
866bda325fcSPaul Mullowney }
867bda325fcSPaul Mullowney 
868bda325fcSPaul Mullowney #undef __FUNCT__
869bda325fcSPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
870b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
871bda325fcSPaul Mullowney {
872aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
873aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
874aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
875bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
876bda325fcSPaul Mullowney   cusparseStatus_t             stat;
877aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
878b06137fdSPaul Mullowney   cudaError_t                  err;
879b175d8bbSPaul Mullowney 
880bda325fcSPaul Mullowney   PetscFunctionBegin;
881aa372e3fSPaul Mullowney 
882aa372e3fSPaul Mullowney   /* allocate space for the triangular factor information */
883aa372e3fSPaul Mullowney   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
884aa372e3fSPaul Mullowney   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
885aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(matstruct->descr);
886aa372e3fSPaul Mullowney   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
887aa372e3fSPaul Mullowney   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);
888aa372e3fSPaul Mullowney 
889b06137fdSPaul Mullowney   /* set alpha and beta */
890b06137fdSPaul Mullowney   err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
891b06137fdSPaul Mullowney   err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
892b06137fdSPaul Mullowney   err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err);
893b06137fdSPaul Mullowney   err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
894b06137fdSPaul Mullowney   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
895b06137fdSPaul Mullowney 
896aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
897aa372e3fSPaul Mullowney     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
898aa372e3fSPaul Mullowney     CsrMatrix *matrixT= new CsrMatrix;
899aa372e3fSPaul Mullowney     matrixT->num_rows = A->rmap->n;
900aa372e3fSPaul Mullowney     matrixT->num_cols = A->cmap->n;
901aa372e3fSPaul Mullowney     matrixT->num_entries = a->nz;
902aa372e3fSPaul Mullowney     matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
903aa372e3fSPaul Mullowney     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
904aa372e3fSPaul Mullowney     matrixT->values = new THRUSTARRAY(a->nz);
905aa372e3fSPaul Mullowney 
906aa372e3fSPaul Mullowney     /* compute the transpose of the upper triangular factor, i.e. the CSC */
907aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
908aa372e3fSPaul Mullowney     stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
909aa372e3fSPaul Mullowney                             matrix->num_cols, matrix->num_entries,
910aa372e3fSPaul Mullowney                             matrix->values->data().get(),
911aa372e3fSPaul Mullowney                             matrix->row_offsets->data().get(),
912aa372e3fSPaul Mullowney                             matrix->column_indices->data().get(),
913aa372e3fSPaul Mullowney                             matrixT->values->data().get(),
914aa372e3fSPaul Mullowney                             matrixT->column_indices->data().get(),
915aa372e3fSPaul Mullowney                             matrixT->row_offsets->data().get(),
916aa372e3fSPaul Mullowney                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);
917aa372e3fSPaul Mullowney 
918aa372e3fSPaul Mullowney     /* assign the pointer */
919aa372e3fSPaul Mullowney     matstructT->mat = matrixT;
920aa372e3fSPaul Mullowney 
921aa372e3fSPaul Mullowney   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
9222692e278SPaul Mullowney #if CUDA_VERSION>=5000
923aa372e3fSPaul Mullowney     /* First convert HYB to CSR */
924aa372e3fSPaul Mullowney     CsrMatrix *temp= new CsrMatrix;
925aa372e3fSPaul Mullowney     temp->num_rows = A->rmap->n;
926aa372e3fSPaul Mullowney     temp->num_cols = A->cmap->n;
927aa372e3fSPaul Mullowney     temp->num_entries = a->nz;
928aa372e3fSPaul Mullowney     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
929aa372e3fSPaul Mullowney     temp->column_indices = new THRUSTINTARRAY32(a->nz);
930aa372e3fSPaul Mullowney     temp->values = new THRUSTARRAY(a->nz);
931aa372e3fSPaul Mullowney 
9322692e278SPaul Mullowney 
933aa372e3fSPaul Mullowney     stat = cusparse_hyb2csr(cusparsestruct->handle,
934aa372e3fSPaul Mullowney                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
935aa372e3fSPaul Mullowney                             temp->values->data().get(),
936aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
937aa372e3fSPaul Mullowney                             temp->column_indices->data().get());CHKERRCUSP(stat);
938aa372e3fSPaul Mullowney 
939aa372e3fSPaul Mullowney     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
940aa372e3fSPaul Mullowney     CsrMatrix *tempT= new CsrMatrix;
941aa372e3fSPaul Mullowney     tempT->num_rows = A->rmap->n;
942aa372e3fSPaul Mullowney     tempT->num_cols = A->cmap->n;
943aa372e3fSPaul Mullowney     tempT->num_entries = a->nz;
944aa372e3fSPaul Mullowney     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
945aa372e3fSPaul Mullowney     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
946aa372e3fSPaul Mullowney     tempT->values = new THRUSTARRAY(a->nz);
947aa372e3fSPaul Mullowney 
948aa372e3fSPaul Mullowney     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
949aa372e3fSPaul Mullowney                             temp->num_cols, temp->num_entries,
950aa372e3fSPaul Mullowney                             temp->values->data().get(),
951aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
952aa372e3fSPaul Mullowney                             temp->column_indices->data().get(),
953aa372e3fSPaul Mullowney                             tempT->values->data().get(),
954aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
955aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
956aa372e3fSPaul Mullowney                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);
957aa372e3fSPaul Mullowney 
958aa372e3fSPaul Mullowney     /* Last, convert CSC to HYB */
959aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat;
960aa372e3fSPaul Mullowney     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
961aa372e3fSPaul Mullowney     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
962aa372e3fSPaul Mullowney       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
963aa372e3fSPaul Mullowney     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
964aa372e3fSPaul Mullowney                             matstructT->descr, tempT->values->data().get(),
965aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
966aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
967aa372e3fSPaul Mullowney                             hybMat, 0, partition);CHKERRCUSP(stat);
968aa372e3fSPaul Mullowney 
969aa372e3fSPaul Mullowney     /* assign the pointer */
970aa372e3fSPaul Mullowney     matstructT->mat = hybMat;
971aa372e3fSPaul Mullowney 
972aa372e3fSPaul Mullowney     /* delete temporaries */
973aa372e3fSPaul Mullowney     if (tempT) {
974aa372e3fSPaul Mullowney       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
975aa372e3fSPaul Mullowney       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
976aa372e3fSPaul Mullowney       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
977aa372e3fSPaul Mullowney       delete (CsrMatrix*) tempT;
978087f3262SPaul Mullowney     }
979aa372e3fSPaul Mullowney     if (temp) {
980aa372e3fSPaul Mullowney       if (temp->values) delete (THRUSTARRAY*) temp->values;
981aa372e3fSPaul Mullowney       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
982aa372e3fSPaul Mullowney       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
983aa372e3fSPaul Mullowney       delete (CsrMatrix*) temp;
984aa372e3fSPaul Mullowney     }
9852692e278SPaul Mullowney #else
9862692e278SPaul Mullowney     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
9872692e278SPaul Mullowney #endif
988aa372e3fSPaul Mullowney   }
989aa372e3fSPaul Mullowney   /* assign the compressed row indices */
990aa372e3fSPaul Mullowney   matstructT->cprowIndices = new THRUSTINTARRAY;
991aa372e3fSPaul Mullowney 
992aa372e3fSPaul Mullowney   /* assign the pointer */
993aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
994bda325fcSPaul Mullowney   PetscFunctionReturn(0);
995bda325fcSPaul Mullowney }
996bda325fcSPaul Mullowney 
997bda325fcSPaul Mullowney #undef __FUNCT__
998bda325fcSPaul Mullowney #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
9996fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1000bda325fcSPaul Mullowney {
1001bda325fcSPaul Mullowney   CUSPARRAY                         *xGPU, *bGPU;
1002bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1003bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1004aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1005aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1006aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1007b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
1008bda325fcSPaul Mullowney 
1009bda325fcSPaul Mullowney   PetscFunctionBegin;
1010aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1011aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1012bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1013aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1014aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1015bda325fcSPaul Mullowney   }
1016bda325fcSPaul Mullowney 
1017bda325fcSPaul Mullowney   /* Get the GPU pointers */
1018bda325fcSPaul Mullowney   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
1019bda325fcSPaul Mullowney   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
1020bda325fcSPaul Mullowney 
1021aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1022aa372e3fSPaul Mullowney   thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
1023aa372e3fSPaul Mullowney                thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
1024aa372e3fSPaul Mullowney                xGPU->begin());
1025aa372e3fSPaul Mullowney 
1026aa372e3fSPaul Mullowney   /* First, solve U */
1027aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1028aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
1029aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1030aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1031aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1032aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1033aa372e3fSPaul Mullowney                         xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);
1034aa372e3fSPaul Mullowney 
1035aa372e3fSPaul Mullowney   /* Then, solve L */
1036aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1037aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1038aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1039aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1040aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1041aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1042aa372e3fSPaul Mullowney                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);
1043aa372e3fSPaul Mullowney 
1044aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1045aa372e3fSPaul Mullowney   thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1046aa372e3fSPaul Mullowney                thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
1047aa372e3fSPaul Mullowney                tempGPU->begin());
1048aa372e3fSPaul Mullowney 
1049aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1050aa372e3fSPaul Mullowney   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());
1051bda325fcSPaul Mullowney 
1052bda325fcSPaul Mullowney   /* restore */
1053bda325fcSPaul Mullowney   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
1054bda325fcSPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
1055bda325fcSPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
1056087f3262SPaul Mullowney 
1057aa372e3fSPaul Mullowney   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1058bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1059bda325fcSPaul Mullowney }
1060bda325fcSPaul Mullowney 
1061bda325fcSPaul Mullowney #undef __FUNCT__
1062bda325fcSPaul Mullowney #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
10636fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1064bda325fcSPaul Mullowney {
1065bda325fcSPaul Mullowney   CUSPARRAY                         *xGPU,*bGPU;
1066bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1067bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1068aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1069aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1070aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1071b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
1072bda325fcSPaul Mullowney 
1073bda325fcSPaul Mullowney   PetscFunctionBegin;
1074aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1075aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1076bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1077aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1078aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1079bda325fcSPaul Mullowney   }
1080bda325fcSPaul Mullowney 
1081bda325fcSPaul Mullowney   /* Get the GPU pointers */
1082bda325fcSPaul Mullowney   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
1083bda325fcSPaul Mullowney   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
1084bda325fcSPaul Mullowney 
1085aa372e3fSPaul Mullowney   /* First, solve U */
1086aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1087aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
1088aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1089aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1090aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1091aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1092aa372e3fSPaul Mullowney                         bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);
1093aa372e3fSPaul Mullowney 
1094aa372e3fSPaul Mullowney   /* Then, solve L */
1095aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1096aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1097aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1098aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1099aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1100aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1101aa372e3fSPaul Mullowney                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);
1102bda325fcSPaul Mullowney 
1103bda325fcSPaul Mullowney   /* restore */
1104bda325fcSPaul Mullowney   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
1105bda325fcSPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
1106bda325fcSPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
1107aa372e3fSPaul Mullowney   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1108bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1109bda325fcSPaul Mullowney }
1110bda325fcSPaul Mullowney 
11119ae82921SPaul Mullowney #undef __FUNCT__
11129ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
11136fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
11149ae82921SPaul Mullowney {
1115bda325fcSPaul Mullowney   CUSPARRAY                         *xGPU,*bGPU;
11169ae82921SPaul Mullowney   cusparseStatus_t                  stat;
11179ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1118aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1119aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1120aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1121b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
11229ae82921SPaul Mullowney 
11239ae82921SPaul Mullowney   PetscFunctionBegin;
1124e057df02SPaul Mullowney   /* Get the GPU pointers */
11259ae82921SPaul Mullowney   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
11269ae82921SPaul Mullowney   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
11279ae82921SPaul Mullowney 
1128aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1129aa372e3fSPaul Mullowney   thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
1130aa372e3fSPaul Mullowney                thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
1131aa372e3fSPaul Mullowney                xGPU->begin());
1132aa372e3fSPaul Mullowney 
1133aa372e3fSPaul Mullowney   /* Next, solve L */
1134aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1135aa372e3fSPaul Mullowney                         loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1136aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1137aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1138aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1139aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1140aa372e3fSPaul Mullowney                         xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);
1141aa372e3fSPaul Mullowney 
1142aa372e3fSPaul Mullowney   /* Then, solve U */
1143aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1144aa372e3fSPaul Mullowney                         upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1145aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1146aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1147aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1148aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1149aa372e3fSPaul Mullowney                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);
1150aa372e3fSPaul Mullowney 
1151aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1152aa372e3fSPaul Mullowney   thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1153aa372e3fSPaul Mullowney                thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
1154aa372e3fSPaul Mullowney                tempGPU->begin());
1155aa372e3fSPaul Mullowney 
1156aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1157aa372e3fSPaul Mullowney   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());
11589ae82921SPaul Mullowney 
11599ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
11609ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
11619ae82921SPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
1162aa372e3fSPaul Mullowney   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
11639ae82921SPaul Mullowney   PetscFunctionReturn(0);
11649ae82921SPaul Mullowney }
11659ae82921SPaul Mullowney 
11669ae82921SPaul Mullowney #undef __FUNCT__
11679ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
11686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
11699ae82921SPaul Mullowney {
1170bda325fcSPaul Mullowney   CUSPARRAY                         *xGPU,*bGPU;
11719ae82921SPaul Mullowney   cusparseStatus_t                  stat;
11729ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1173aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1174aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1175aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1176b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
11779ae82921SPaul Mullowney 
11789ae82921SPaul Mullowney   PetscFunctionBegin;
1179e057df02SPaul Mullowney   /* Get the GPU pointers */
11809ae82921SPaul Mullowney   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
11819ae82921SPaul Mullowney   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
11829ae82921SPaul Mullowney 
1183aa372e3fSPaul Mullowney   /* First, solve L */
1184aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1185aa372e3fSPaul Mullowney                         loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1186aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1187aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1188aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1189aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1190aa372e3fSPaul Mullowney                         bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);
1191aa372e3fSPaul Mullowney 
1192aa372e3fSPaul Mullowney   /* Next, solve U */
1193aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1194aa372e3fSPaul Mullowney                         upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1195aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1196aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1197aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1198aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1199aa372e3fSPaul Mullowney                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);
12009ae82921SPaul Mullowney 
12019ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
12029ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
12039ae82921SPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
1204aa372e3fSPaul Mullowney   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
12059ae82921SPaul Mullowney   PetscFunctionReturn(0);
12069ae82921SPaul Mullowney }
12079ae82921SPaul Mullowney 
12089ae82921SPaul Mullowney #undef __FUNCT__
1209e057df02SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
12106fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
12119ae82921SPaul Mullowney {
12129ae82921SPaul Mullowney 
1213aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1214aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
12159ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
12169ae82921SPaul Mullowney   PetscInt                     m = A->rmap->n,*ii,*ridx;
12179ae82921SPaul Mullowney   PetscErrorCode               ierr;
1218aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1219b06137fdSPaul Mullowney   cudaError_t                  err;
12209ae82921SPaul Mullowney 
12219ae82921SPaul Mullowney   PetscFunctionBegin;
12229ae82921SPaul Mullowney   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
12239ae82921SPaul Mullowney     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1224*ce814652SDominic Meiser     Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
12259ae82921SPaul Mullowney     try {
1226aa372e3fSPaul Mullowney       cusparsestruct->nonzerorow=0;
1227aa372e3fSPaul Mullowney       for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
12289ae82921SPaul Mullowney 
12299ae82921SPaul Mullowney       if (a->compressedrow.use) {
12309ae82921SPaul Mullowney         m    = a->compressedrow.nrows;
12319ae82921SPaul Mullowney         ii   = a->compressedrow.i;
12329ae82921SPaul Mullowney         ridx = a->compressedrow.rindex;
12339ae82921SPaul Mullowney       } else {
1234b06137fdSPaul Mullowney         /* Forcing compressed row on the GPU */
12359ae82921SPaul Mullowney         int k=0;
1236785e854fSJed Brown         ierr = PetscMalloc1((cusparsestruct->nonzerorow+1), &ii);CHKERRQ(ierr);
1237785e854fSJed Brown         ierr = PetscMalloc1((cusparsestruct->nonzerorow), &ridx);CHKERRQ(ierr);
12389ae82921SPaul Mullowney         ii[0]=0;
12399ae82921SPaul Mullowney         for (int j = 0; j<m; j++) {
12409ae82921SPaul Mullowney           if ((a->i[j+1]-a->i[j])>0) {
12419ae82921SPaul Mullowney             ii[k]  = a->i[j];
12429ae82921SPaul Mullowney             ridx[k]= j;
12439ae82921SPaul Mullowney             k++;
12449ae82921SPaul Mullowney           }
12459ae82921SPaul Mullowney         }
1246aa372e3fSPaul Mullowney         ii[cusparsestruct->nonzerorow] = a->nz;
1247aa372e3fSPaul Mullowney         m = cusparsestruct->nonzerorow;
12489ae82921SPaul Mullowney       }
12499ae82921SPaul Mullowney 
1250aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
1251aa372e3fSPaul Mullowney       matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1252aa372e3fSPaul Mullowney       stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
1253aa372e3fSPaul Mullowney       stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
1254aa372e3fSPaul Mullowney       stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);
12559ae82921SPaul Mullowney 
1256b06137fdSPaul Mullowney       err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
1257b06137fdSPaul Mullowney       err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1258b06137fdSPaul Mullowney       err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
1259b06137fdSPaul Mullowney       err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1260b06137fdSPaul Mullowney       stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
1261b06137fdSPaul Mullowney 
1262aa372e3fSPaul Mullowney       /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1263aa372e3fSPaul Mullowney       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1264aa372e3fSPaul Mullowney         /* set the matrix */
1265aa372e3fSPaul Mullowney 	CsrMatrix *matrix= new CsrMatrix;
1266a65300a6SPaul Mullowney 	matrix->num_rows = m;
1267aa372e3fSPaul Mullowney 	matrix->num_cols = A->cmap->n;
1268aa372e3fSPaul Mullowney 	matrix->num_entries = a->nz;
1269a65300a6SPaul Mullowney 	matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1270a65300a6SPaul Mullowney 	matrix->row_offsets->assign(ii, ii + m+1);
12719ae82921SPaul Mullowney 
1272aa372e3fSPaul Mullowney 	matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1273aa372e3fSPaul Mullowney 	matrix->column_indices->assign(a->j, a->j+a->nz);
1274aa372e3fSPaul Mullowney 
1275aa372e3fSPaul Mullowney 	matrix->values = new THRUSTARRAY(a->nz);
1276aa372e3fSPaul Mullowney 	matrix->values->assign(a->a, a->a+a->nz);
1277aa372e3fSPaul Mullowney 
1278aa372e3fSPaul Mullowney         /* assign the pointer */
1279aa372e3fSPaul Mullowney         matstruct->mat = matrix;
1280aa372e3fSPaul Mullowney 
1281aa372e3fSPaul Mullowney       } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
12822692e278SPaul Mullowney #if CUDA_VERSION>=4020
1283aa372e3fSPaul Mullowney 	CsrMatrix *matrix= new CsrMatrix;
1284a65300a6SPaul Mullowney 	matrix->num_rows = m;
1285aa372e3fSPaul Mullowney 	matrix->num_cols = A->cmap->n;
1286aa372e3fSPaul Mullowney 	matrix->num_entries = a->nz;
1287a65300a6SPaul Mullowney 	matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1288a65300a6SPaul Mullowney 	matrix->row_offsets->assign(ii, ii + m+1);
1289aa372e3fSPaul Mullowney 
1290aa372e3fSPaul Mullowney 	matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1291aa372e3fSPaul Mullowney 	matrix->column_indices->assign(a->j, a->j+a->nz);
1292aa372e3fSPaul Mullowney 
1293aa372e3fSPaul Mullowney 	matrix->values = new THRUSTARRAY(a->nz);
1294aa372e3fSPaul Mullowney 	matrix->values->assign(a->a, a->a+a->nz);
1295aa372e3fSPaul Mullowney 
1296aa372e3fSPaul Mullowney         cusparseHybMat_t hybMat;
1297aa372e3fSPaul Mullowney         stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
1298aa372e3fSPaul Mullowney         cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1299aa372e3fSPaul Mullowney           CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1300a65300a6SPaul Mullowney         stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1301aa372e3fSPaul Mullowney                                 matstruct->descr, matrix->values->data().get(),
1302aa372e3fSPaul Mullowney                                 matrix->row_offsets->data().get(),
1303aa372e3fSPaul Mullowney                                 matrix->column_indices->data().get(),
1304aa372e3fSPaul Mullowney                                 hybMat, 0, partition);CHKERRCUSP(stat);
1305aa372e3fSPaul Mullowney         /* assign the pointer */
1306aa372e3fSPaul Mullowney         matstruct->mat = hybMat;
1307aa372e3fSPaul Mullowney 
1308aa372e3fSPaul Mullowney 	if (matrix) {
1309aa372e3fSPaul Mullowney 	  if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1310aa372e3fSPaul Mullowney 	  if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1311aa372e3fSPaul Mullowney 	  if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1312aa372e3fSPaul Mullowney 	  delete (CsrMatrix*)matrix;
1313087f3262SPaul Mullowney 	}
13142692e278SPaul Mullowney #endif
1315087f3262SPaul Mullowney       }
1316ca45077fSPaul Mullowney 
1317aa372e3fSPaul Mullowney       /* assign the compressed row indices */
1318aa372e3fSPaul Mullowney       matstruct->cprowIndices = new THRUSTINTARRAY(m);
1319aa372e3fSPaul Mullowney       matstruct->cprowIndices->assign(ridx,ridx+m);
1320aa372e3fSPaul Mullowney 
1321aa372e3fSPaul Mullowney       /* assign the pointer */
1322aa372e3fSPaul Mullowney       cusparsestruct->mat = matstruct;
1323aa372e3fSPaul Mullowney 
13249ae82921SPaul Mullowney       if (!a->compressedrow.use) {
13259ae82921SPaul Mullowney         ierr = PetscFree(ii);CHKERRQ(ierr);
13269ae82921SPaul Mullowney         ierr = PetscFree(ridx);CHKERRQ(ierr);
13279ae82921SPaul Mullowney       }
1328aa372e3fSPaul Mullowney       cusparsestruct->workVector = new THRUSTARRAY;
1329aa372e3fSPaul Mullowney       cusparsestruct->workVector->resize(m);
13309ae82921SPaul Mullowney     } catch(char *ex) {
13319ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
13329ae82921SPaul Mullowney     }
13339ae82921SPaul Mullowney     ierr = WaitForGPU();CHKERRCUSP(ierr);
13342205254eSKarl Rupp 
13359ae82921SPaul Mullowney     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
13362205254eSKarl Rupp 
13379ae82921SPaul Mullowney     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
13389ae82921SPaul Mullowney   }
13399ae82921SPaul Mullowney   PetscFunctionReturn(0);
13409ae82921SPaul Mullowney }
13419ae82921SPaul Mullowney 
13429ae82921SPaul Mullowney #undef __FUNCT__
13432a7a6963SBarry Smith #define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
13442a7a6963SBarry Smith static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
13459ae82921SPaul Mullowney {
13469ae82921SPaul Mullowney   PetscErrorCode ierr;
134733d57670SJed Brown   PetscInt rbs,cbs;
13489ae82921SPaul Mullowney 
13499ae82921SPaul Mullowney   PetscFunctionBegin;
135033d57670SJed Brown   ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
13519ae82921SPaul Mullowney   if (right) {
1352ce94432eSBarry Smith     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
13539ae82921SPaul Mullowney     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
135433d57670SJed Brown     ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
13559ae82921SPaul Mullowney     ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
13569ae82921SPaul Mullowney     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
13579ae82921SPaul Mullowney   }
13589ae82921SPaul Mullowney   if (left) {
1359ce94432eSBarry Smith     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
13609ae82921SPaul Mullowney     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
136133d57670SJed Brown     ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
13629ae82921SPaul Mullowney     ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
13639ae82921SPaul Mullowney     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
13649ae82921SPaul Mullowney   }
13659ae82921SPaul Mullowney   PetscFunctionReturn(0);
13669ae82921SPaul Mullowney }
13679ae82921SPaul Mullowney 
1368aa372e3fSPaul Mullowney struct VecCUSPPlusEquals
1369aa372e3fSPaul Mullowney {
1370aa372e3fSPaul Mullowney   template <typename Tuple>
1371aa372e3fSPaul Mullowney   __host__ __device__
1372aa372e3fSPaul Mullowney   void operator()(Tuple t)
1373aa372e3fSPaul Mullowney   {
1374aa372e3fSPaul Mullowney     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1375aa372e3fSPaul Mullowney   }
1376aa372e3fSPaul Mullowney };
1377aa372e3fSPaul Mullowney 
13789ae82921SPaul Mullowney #undef __FUNCT__
13799ae82921SPaul Mullowney #define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
13806fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
13819ae82921SPaul Mullowney {
13829ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1383aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1384aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1385bda325fcSPaul Mullowney   CUSPARRAY                    *xarray,*yarray;
1386b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
1387aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
13889ae82921SPaul Mullowney 
13899ae82921SPaul Mullowney   PetscFunctionBegin;
1390e057df02SPaul Mullowney   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1391e057df02SPaul Mullowney      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
13929ae82921SPaul Mullowney   ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
13939ae82921SPaul Mullowney   ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
1394aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1395aa372e3fSPaul Mullowney     CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1396aa372e3fSPaul Mullowney     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1397aa372e3fSPaul Mullowney                              mat->num_rows, mat->num_cols, mat->num_entries,
1398b06137fdSPaul Mullowney                              matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
1399b06137fdSPaul Mullowney                              mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
1400aa372e3fSPaul Mullowney                              yarray->data().get());CHKERRCUSP(stat);
1401aa372e3fSPaul Mullowney   } else {
14022692e278SPaul Mullowney #if CUDA_VERSION>=4020
1403aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1404aa372e3fSPaul Mullowney     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1405b06137fdSPaul Mullowney                              matstruct->alpha, matstruct->descr, hybMat,
1406b06137fdSPaul Mullowney                              xarray->data().get(), matstruct->beta,
1407aa372e3fSPaul Mullowney                              yarray->data().get());CHKERRCUSP(stat);
14082692e278SPaul Mullowney #endif
14099ae82921SPaul Mullowney   }
14109ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
14119ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
1412aa372e3fSPaul Mullowney   if (!cusparsestruct->stream) {
14139ae82921SPaul Mullowney     ierr = WaitForGPU();CHKERRCUSP(ierr);
1414ca45077fSPaul Mullowney   }
1415aa372e3fSPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
14169ae82921SPaul Mullowney   PetscFunctionReturn(0);
14179ae82921SPaul Mullowney }
14189ae82921SPaul Mullowney 
14199ae82921SPaul Mullowney #undef __FUNCT__
1420ca45077fSPaul Mullowney #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
14216fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1422ca45077fSPaul Mullowney {
1423ca45077fSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1424aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1425aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1426bda325fcSPaul Mullowney   CUSPARRAY                    *xarray,*yarray;
1427b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
1428aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1429ca45077fSPaul Mullowney 
1430ca45077fSPaul Mullowney   PetscFunctionBegin;
1431e057df02SPaul Mullowney   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1432e057df02SPaul Mullowney      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
1433aa372e3fSPaul Mullowney   if (!matstructT) {
1434bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1435aa372e3fSPaul Mullowney     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1436bda325fcSPaul Mullowney   }
1437ca45077fSPaul Mullowney   ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1438ca45077fSPaul Mullowney   ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
1439aa372e3fSPaul Mullowney 
1440aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1441aa372e3fSPaul Mullowney     CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1442aa372e3fSPaul Mullowney     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1443aa372e3fSPaul Mullowney                              mat->num_rows, mat->num_cols,
1444b06137fdSPaul Mullowney                              mat->num_entries, matstructT->alpha, matstructT->descr,
1445aa372e3fSPaul Mullowney                              mat->values->data().get(), mat->row_offsets->data().get(),
1446b06137fdSPaul Mullowney                              mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
1447aa372e3fSPaul Mullowney                              yarray->data().get());CHKERRCUSP(stat);
1448aa372e3fSPaul Mullowney   } else {
14492692e278SPaul Mullowney #if CUDA_VERSION>=4020
1450aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1451aa372e3fSPaul Mullowney     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1452b06137fdSPaul Mullowney                              matstructT->alpha, matstructT->descr, hybMat,
1453b06137fdSPaul Mullowney                              xarray->data().get(), matstructT->beta,
1454aa372e3fSPaul Mullowney                              yarray->data().get());CHKERRCUSP(stat);
14552692e278SPaul Mullowney #endif
1456ca45077fSPaul Mullowney   }
1457ca45077fSPaul Mullowney   ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1458ca45077fSPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
1459aa372e3fSPaul Mullowney   if (!cusparsestruct->stream) {
1460ca45077fSPaul Mullowney     ierr = WaitForGPU();CHKERRCUSP(ierr);
1461ca45077fSPaul Mullowney   }
1462aa372e3fSPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
1463ca45077fSPaul Mullowney   PetscFunctionReturn(0);
1464ca45077fSPaul Mullowney }
1465ca45077fSPaul Mullowney 
1466aa372e3fSPaul Mullowney 
1467ca45077fSPaul Mullowney #undef __FUNCT__
14689ae82921SPaul Mullowney #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
14696fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
14709ae82921SPaul Mullowney {
14719ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1472aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1473aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1474bda325fcSPaul Mullowney   CUSPARRAY                    *xarray,*yarray,*zarray;
1475b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
1476aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
14776e111a19SKarl Rupp 
14789ae82921SPaul Mullowney   PetscFunctionBegin;
1479e057df02SPaul Mullowney   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1480e057df02SPaul Mullowney      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
14819ae82921SPaul Mullowney   try {
14829ae82921SPaul Mullowney     ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
14839ae82921SPaul Mullowney     ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
14849ae82921SPaul Mullowney     ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
14859ae82921SPaul Mullowney     ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
14869ae82921SPaul Mullowney 
1487e057df02SPaul Mullowney     /* multiply add */
1488aa372e3fSPaul Mullowney     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1489aa372e3fSPaul Mullowney       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1490b06137fdSPaul Mullowney       /* here we need to be careful to set the number of rows in the multiply to the
1491b06137fdSPaul Mullowney 	 number of compressed rows in the matrix ... which is equivalent to the
1492b06137fdSPaul Mullowney 	 size of the workVector */
1493aa372e3fSPaul Mullowney       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1494a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
1495b06137fdSPaul Mullowney                                mat->num_entries, matstruct->alpha, matstruct->descr,
1496aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
1497b06137fdSPaul Mullowney                                mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
1498aa372e3fSPaul Mullowney                                cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1499aa372e3fSPaul Mullowney     } else {
15002692e278SPaul Mullowney #if CUDA_VERSION>=4020
1501aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1502a65300a6SPaul Mullowney       if (cusparsestruct->workVector->size()) {
1503aa372e3fSPaul Mullowney 	stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1504b06137fdSPaul Mullowney 				 matstruct->alpha, matstruct->descr, hybMat,
1505b06137fdSPaul Mullowney 				 xarray->data().get(), matstruct->beta,
1506aa372e3fSPaul Mullowney 				 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1507a65300a6SPaul Mullowney       }
15082692e278SPaul Mullowney #endif
1509aa372e3fSPaul Mullowney     }
1510aa372e3fSPaul Mullowney 
1511aa372e3fSPaul Mullowney     /* scatter the data from the temporary into the full vector with a += operation */
1512aa372e3fSPaul Mullowney     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
1513aa372e3fSPaul Mullowney 		     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1514aa372e3fSPaul Mullowney 		     VecCUSPPlusEquals());
15159ae82921SPaul Mullowney     ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
15169ae82921SPaul Mullowney     ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
15179ae82921SPaul Mullowney     ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
15189ae82921SPaul Mullowney 
15199ae82921SPaul Mullowney   } catch(char *ex) {
15209ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
15219ae82921SPaul Mullowney   }
15229ae82921SPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
15239ae82921SPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
15249ae82921SPaul Mullowney   PetscFunctionReturn(0);
15259ae82921SPaul Mullowney }
15269ae82921SPaul Mullowney 
15279ae82921SPaul Mullowney #undef __FUNCT__
1528b175d8bbSPaul Mullowney #define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
15296fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1530ca45077fSPaul Mullowney {
1531ca45077fSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1532aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1533aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1534ca45077fSPaul Mullowney   CUSPARRAY                    *xarray,*yarray,*zarray;
1535b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
1536aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
15376e111a19SKarl Rupp 
1538ca45077fSPaul Mullowney   PetscFunctionBegin;
1539e057df02SPaul Mullowney   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1540e057df02SPaul Mullowney      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
1541aa372e3fSPaul Mullowney   if (!matstructT) {
1542bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1543aa372e3fSPaul Mullowney     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1544bda325fcSPaul Mullowney   }
1545aa372e3fSPaul Mullowney 
1546ca45077fSPaul Mullowney   try {
1547ca45077fSPaul Mullowney     ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
1548ca45077fSPaul Mullowney     ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1549ca45077fSPaul Mullowney     ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
1550ca45077fSPaul Mullowney     ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1551ca45077fSPaul Mullowney 
1552e057df02SPaul Mullowney     /* multiply add with matrix transpose */
1553aa372e3fSPaul Mullowney     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1554aa372e3fSPaul Mullowney       CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1555b06137fdSPaul Mullowney       /* here we need to be careful to set the number of rows in the multiply to the
1556b06137fdSPaul Mullowney 	 number of compressed rows in the matrix ... which is equivalent to the
1557b06137fdSPaul Mullowney 	 size of the workVector */
1558aa372e3fSPaul Mullowney       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1559a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
1560b06137fdSPaul Mullowney 			       mat->num_entries, matstructT->alpha, matstructT->descr,
1561aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
1562b06137fdSPaul Mullowney                                mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
1563aa372e3fSPaul Mullowney                                cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1564aa372e3fSPaul Mullowney     } else {
15652692e278SPaul Mullowney #if CUDA_VERSION>=4020
1566aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1567a65300a6SPaul Mullowney       if (cusparsestruct->workVector->size()) {
1568aa372e3fSPaul Mullowney 	stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1569b06137fdSPaul Mullowney 				 matstructT->alpha, matstructT->descr, hybMat,
1570b06137fdSPaul Mullowney 				 xarray->data().get(), matstructT->beta,
1571aa372e3fSPaul Mullowney 				 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1572a65300a6SPaul Mullowney       }
15732692e278SPaul Mullowney #endif
1574aa372e3fSPaul Mullowney     }
1575aa372e3fSPaul Mullowney 
1576aa372e3fSPaul Mullowney     /* scatter the data from the temporary into the full vector with a += operation */
1577aa372e3fSPaul Mullowney     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))),
1578aa372e3fSPaul Mullowney 		     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1579aa372e3fSPaul Mullowney 		     VecCUSPPlusEquals());
1580ca45077fSPaul Mullowney 
1581ca45077fSPaul Mullowney     ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1582ca45077fSPaul Mullowney     ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
1583ca45077fSPaul Mullowney     ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1584ca45077fSPaul Mullowney 
1585ca45077fSPaul Mullowney   } catch(char *ex) {
1586ca45077fSPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1587ca45077fSPaul Mullowney   }
1588ca45077fSPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
1589ca45077fSPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1590ca45077fSPaul Mullowney   PetscFunctionReturn(0);
1591ca45077fSPaul Mullowney }
1592ca45077fSPaul Mullowney 
1593ca45077fSPaul Mullowney #undef __FUNCT__
15949ae82921SPaul Mullowney #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
15956fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
15969ae82921SPaul Mullowney {
15979ae82921SPaul Mullowney   PetscErrorCode ierr;
15986e111a19SKarl Rupp 
15999ae82921SPaul Mullowney   PetscFunctionBegin;
16009ae82921SPaul Mullowney   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
1601bc3f50f2SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
1602e057df02SPaul Mullowney     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1603bc3f50f2SPaul Mullowney   }
16049ae82921SPaul Mullowney   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1605bbf3fe20SPaul Mullowney   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1606bbf3fe20SPaul Mullowney   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1607bbf3fe20SPaul Mullowney   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1608bbf3fe20SPaul Mullowney   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
16099ae82921SPaul Mullowney   PetscFunctionReturn(0);
16109ae82921SPaul Mullowney }
16119ae82921SPaul Mullowney 
16129ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
16139ae82921SPaul Mullowney #undef __FUNCT__
16149ae82921SPaul Mullowney #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
1615e057df02SPaul Mullowney /*@
16169ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1617e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
1618e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1619e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
1620e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
1621e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
16229ae82921SPaul Mullowney 
16239ae82921SPaul Mullowney    Collective on MPI_Comm
16249ae82921SPaul Mullowney 
16259ae82921SPaul Mullowney    Input Parameters:
16269ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
16279ae82921SPaul Mullowney .  m - number of rows
16289ae82921SPaul Mullowney .  n - number of columns
16299ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
16309ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
16310298fd71SBarry Smith          (possibly different for each row) or NULL
16329ae82921SPaul Mullowney 
16339ae82921SPaul Mullowney    Output Parameter:
16349ae82921SPaul Mullowney .  A - the matrix
16359ae82921SPaul Mullowney 
16369ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
16379ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
16389ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
16399ae82921SPaul Mullowney 
16409ae82921SPaul Mullowney    Notes:
16419ae82921SPaul Mullowney    If nnz is given then nz is ignored
16429ae82921SPaul Mullowney 
16439ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
16449ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
16459ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
16469ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
16479ae82921SPaul Mullowney 
16489ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
16490298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
16509ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
16519ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
16529ae82921SPaul Mullowney 
16539ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
16549ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
16559ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
16569ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
16579ae82921SPaul Mullowney 
16589ae82921SPaul Mullowney    Level: intermediate
16599ae82921SPaul Mullowney 
1660e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
16619ae82921SPaul Mullowney @*/
16629ae82921SPaul Mullowney PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
16639ae82921SPaul Mullowney {
16649ae82921SPaul Mullowney   PetscErrorCode ierr;
16659ae82921SPaul Mullowney 
16669ae82921SPaul Mullowney   PetscFunctionBegin;
16679ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
16689ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
16699ae82921SPaul Mullowney   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
16709ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
16719ae82921SPaul Mullowney   PetscFunctionReturn(0);
16729ae82921SPaul Mullowney }
16739ae82921SPaul Mullowney 
16749ae82921SPaul Mullowney #undef __FUNCT__
16759ae82921SPaul Mullowney #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
16766fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
16779ae82921SPaul Mullowney {
16789ae82921SPaul Mullowney   PetscErrorCode   ierr;
1679ab25e6cbSDominic Meiser 
16809ae82921SPaul Mullowney   PetscFunctionBegin;
16819ae82921SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
16829ae82921SPaul Mullowney     if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
1683ab25e6cbSDominic Meiser       ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
16849ae82921SPaul Mullowney     }
16859ae82921SPaul Mullowney   } else {
1686ab25e6cbSDominic Meiser     ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
1687aa372e3fSPaul Mullowney   }
16889ae82921SPaul Mullowney   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
16899ae82921SPaul Mullowney   PetscFunctionReturn(0);
16909ae82921SPaul Mullowney }
16919ae82921SPaul Mullowney 
16929ae82921SPaul Mullowney #undef __FUNCT__
16939ae82921SPaul Mullowney #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
16948cc058d9SJed Brown PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
16959ae82921SPaul Mullowney {
16969ae82921SPaul Mullowney   PetscErrorCode ierr;
1697aa372e3fSPaul Mullowney   cusparseStatus_t stat;
1698aa372e3fSPaul Mullowney   cusparseHandle_t handle=0;
16999ae82921SPaul Mullowney 
17009ae82921SPaul Mullowney   PetscFunctionBegin;
17019ae82921SPaul Mullowney   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
17029ae82921SPaul Mullowney   if (B->factortype==MAT_FACTOR_NONE) {
1703e057df02SPaul Mullowney     /* you cannot check the inode.use flag here since the matrix was just created.
1704e057df02SPaul Mullowney        now build a GPU matrix data structure */
17059ae82921SPaul Mullowney     B->spptr = new Mat_SeqAIJCUSPARSE;
17069ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
1707aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1708aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
1709e057df02SPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
1710aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1711aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = 0;
1712aa372e3fSPaul Mullowney     stat = cusparseCreate(&handle);CHKERRCUSP(stat);
1713aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
1714aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
17159ae82921SPaul Mullowney   } else {
17169ae82921SPaul Mullowney     /* NEXT, set the pointers to the triangular factors */
1717debe9ee2SPaul Mullowney     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
17189ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
17199ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
1720aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1721aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1722aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
1723aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
1724aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
1725aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = 0;
1726aa372e3fSPaul Mullowney     stat = cusparseCreate(&handle);CHKERRCUSP(stat);
1727aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
1728aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
17299ae82921SPaul Mullowney   }
1730aa372e3fSPaul Mullowney 
1731e057df02SPaul Mullowney   /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
1732e057df02SPaul Mullowney      default cusparse tri solve. Note the difference with the implementation in
1733e057df02SPaul Mullowney      MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
1734bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
17352205254eSKarl Rupp 
17369ae82921SPaul Mullowney   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
17379ae82921SPaul Mullowney   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
17382a7a6963SBarry Smith   B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
17399ae82921SPaul Mullowney   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1740ca45077fSPaul Mullowney   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
1741ca45077fSPaul Mullowney   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1742ca45077fSPaul Mullowney   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1743ca45077fSPaul Mullowney   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
17442205254eSKarl Rupp 
17459ae82921SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
17462205254eSKarl Rupp 
17479ae82921SPaul Mullowney   B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
17482205254eSKarl Rupp 
1749bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
17509ae82921SPaul Mullowney   PetscFunctionReturn(0);
17519ae82921SPaul Mullowney }
17529ae82921SPaul Mullowney 
1753e057df02SPaul Mullowney /*M
1754e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1755e057df02SPaul Mullowney 
1756e057df02SPaul Mullowney    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
17572692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
17582692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1759e057df02SPaul Mullowney 
1760e057df02SPaul Mullowney    Options Database Keys:
1761e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1762aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1763aa372e3fSPaul Mullowney .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1764e057df02SPaul Mullowney 
1765e057df02SPaul Mullowney   Level: beginner
1766e057df02SPaul Mullowney 
17678468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1768e057df02SPaul Mullowney M*/
17697f756511SDominic Meiser 
17707f756511SDominic Meiser #undef __FUNCT__
17717f756511SDominic Meiser #define __FUNCT__ "Mat_SeqAIJCUSPARSE_Destroy"
17727f756511SDominic Meiser static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
17737f756511SDominic Meiser {
17747f756511SDominic Meiser   cusparseStatus_t stat;
17757f756511SDominic Meiser   cusparseHandle_t handle;
17767f756511SDominic Meiser 
17777f756511SDominic Meiser   PetscFunctionBegin;
17787f756511SDominic Meiser   if (*cusparsestruct) {
17797f756511SDominic Meiser     Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
17807f756511SDominic Meiser     Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
17817f756511SDominic Meiser     delete (*cusparsestruct)->workVector;
17827f756511SDominic Meiser     if (handle = (*cusparsestruct)->handle) {
17837f756511SDominic Meiser       stat = cusparseDestroy(handle);CHKERRCUSP(stat);
17847f756511SDominic Meiser     }
17857f756511SDominic Meiser     delete *cusparsestruct;
17867f756511SDominic Meiser     *cusparsestruct = 0;
17877f756511SDominic Meiser   }
17887f756511SDominic Meiser   PetscFunctionReturn(0);
17897f756511SDominic Meiser }
17907f756511SDominic Meiser 
17917f756511SDominic Meiser #undef __FUNCT__
17927f756511SDominic Meiser #define __FUNCT__ "CsrMatrix_Destroy"
17937f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
17947f756511SDominic Meiser {
17957f756511SDominic Meiser   PetscFunctionBegin;
17967f756511SDominic Meiser   if (*mat) {
17977f756511SDominic Meiser     delete (*mat)->values;
17987f756511SDominic Meiser     delete (*mat)->column_indices;
17997f756511SDominic Meiser     delete (*mat)->row_offsets;
18007f756511SDominic Meiser     delete *mat;
18017f756511SDominic Meiser     *mat = 0;
18027f756511SDominic Meiser   }
18037f756511SDominic Meiser   PetscFunctionReturn(0);
18047f756511SDominic Meiser }
18057f756511SDominic Meiser 
18067f756511SDominic Meiser #undef __FUNCT__
18077f756511SDominic Meiser #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactorStruct_Destroy"
18087f756511SDominic Meiser static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
18097f756511SDominic Meiser {
18107f756511SDominic Meiser   cusparseStatus_t stat;
18117f756511SDominic Meiser   PetscErrorCode   ierr;
18127f756511SDominic Meiser 
18137f756511SDominic Meiser   PetscFunctionBegin;
18147f756511SDominic Meiser   if (*trifactor) {
18157f756511SDominic Meiser     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSP(stat); }
18167f756511SDominic Meiser     if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSP(stat); }
18177f756511SDominic Meiser     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
18187f756511SDominic Meiser     delete *trifactor;
18197f756511SDominic Meiser     *trifactor = 0;
18207f756511SDominic Meiser   }
18217f756511SDominic Meiser   PetscFunctionReturn(0);
18227f756511SDominic Meiser }
18237f756511SDominic Meiser 
18247f756511SDominic Meiser #undef __FUNCT__
18257f756511SDominic Meiser #define __FUNCT__ "Mat_SeqAIJCUSPARSEMultStruct_Destroy"
18267f756511SDominic Meiser static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
18277f756511SDominic Meiser {
18287f756511SDominic Meiser   CsrMatrix        *mat;
18297f756511SDominic Meiser   cusparseStatus_t stat;
18307f756511SDominic Meiser   cudaError_t      err;
18317f756511SDominic Meiser 
18327f756511SDominic Meiser   PetscFunctionBegin;
18337f756511SDominic Meiser   if (*matstruct) {
18347f756511SDominic Meiser     if ((*matstruct)->mat) {
18357f756511SDominic Meiser       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
18367f756511SDominic Meiser         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
18377f756511SDominic Meiser         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
18387f756511SDominic Meiser       } else {
18397f756511SDominic Meiser         mat = (CsrMatrix*)(*matstruct)->mat;
18407f756511SDominic Meiser         CsrMatrix_Destroy(&mat);
18417f756511SDominic Meiser       }
18427f756511SDominic Meiser     }
18437f756511SDominic Meiser     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSP(stat); }
18447f756511SDominic Meiser     delete (*matstruct)->cprowIndices;
18457f756511SDominic Meiser     if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUSP(err); }
18467f756511SDominic Meiser     if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUSP(err); }
18477f756511SDominic Meiser     delete *matstruct;
18487f756511SDominic Meiser     *matstruct = 0;
18497f756511SDominic Meiser   }
18507f756511SDominic Meiser   PetscFunctionReturn(0);
18517f756511SDominic Meiser }
18527f756511SDominic Meiser 
18537f756511SDominic Meiser #undef __FUNCT__
18547f756511SDominic Meiser #define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactors_Destroy"
18557f756511SDominic Meiser static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
18567f756511SDominic Meiser {
18577f756511SDominic Meiser   cusparseHandle_t handle;
18587f756511SDominic Meiser   cusparseStatus_t stat;
18597f756511SDominic Meiser 
18607f756511SDominic Meiser   PetscFunctionBegin;
18617f756511SDominic Meiser   if (*trifactors) {
18627f756511SDominic Meiser     Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr);
18637f756511SDominic Meiser     Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr);
18647f756511SDominic Meiser     Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
18657f756511SDominic Meiser     Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
18667f756511SDominic Meiser     delete (*trifactors)->rpermIndices;
18677f756511SDominic Meiser     delete (*trifactors)->cpermIndices;
18687f756511SDominic Meiser     delete (*trifactors)->workVector;
18697f756511SDominic Meiser     if (handle = (*trifactors)->handle) {
18707f756511SDominic Meiser       stat = cusparseDestroy(handle);CHKERRCUSP(stat);
18717f756511SDominic Meiser     }
18727f756511SDominic Meiser     delete *trifactors;
18737f756511SDominic Meiser     *trifactors = 0;
18747f756511SDominic Meiser   }
18757f756511SDominic Meiser   PetscFunctionReturn(0);
18767f756511SDominic Meiser }
18777f756511SDominic Meiser 
1878