xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision a183c035f1d63567095ad55d51f6b3f2111532ac)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3bc3f50f2SPaul Mullowney   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
59ae82921SPaul Mullowney 
63d13b8fdSMatthew G. Knepley #include <petscconf.h>
73d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
8087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
93d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
103d13b8fdSMatthew G. Knepley #include <petsc-private/vecimpl.h>
119ae82921SPaul Mullowney #undef VecType
123d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
13bc3f50f2SPaul Mullowney 
14e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
159ae82921SPaul Mullowney 
16087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
17087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
19087f3262SPaul Mullowney 
206fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
216fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
226fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
23087f3262SPaul Mullowney 
246fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
256fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
266fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
276fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
286fa9248bSJed Brown static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
296fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
306fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
316fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
326fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
339ae82921SPaul Mullowney 
349ae82921SPaul Mullowney #undef __FUNCT__
35b06137fdSPaul Mullowney #define __FUNCT__ "MatCUSPARSESetStream"
36b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
37b06137fdSPaul Mullowney {
38b06137fdSPaul Mullowney   cusparseStatus_t   stat;
39b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
40b06137fdSPaul Mullowney 
41b06137fdSPaul Mullowney   PetscFunctionBegin;
42b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
43b06137fdSPaul Mullowney   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSP(stat);
44b06137fdSPaul Mullowney   PetscFunctionReturn(0);
45b06137fdSPaul Mullowney }
46b06137fdSPaul Mullowney 
47b06137fdSPaul Mullowney #undef __FUNCT__
48b06137fdSPaul Mullowney #define __FUNCT__ "MatCUSPARSESetHandle"
49b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
50b06137fdSPaul Mullowney {
51b06137fdSPaul Mullowney   cusparseStatus_t   stat;
52b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
53b06137fdSPaul Mullowney 
54b06137fdSPaul Mullowney   PetscFunctionBegin;
55b06137fdSPaul Mullowney   if (cusparsestruct->handle)
56b06137fdSPaul Mullowney     stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat);
57b06137fdSPaul Mullowney   cusparsestruct->handle = handle;
58b06137fdSPaul Mullowney   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
59b06137fdSPaul Mullowney   PetscFunctionReturn(0);
60b06137fdSPaul Mullowney }
61b06137fdSPaul Mullowney 
62b06137fdSPaul Mullowney #undef __FUNCT__
63b06137fdSPaul Mullowney #define __FUNCT__ "MatCUSPARSEClearHandle"
64b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
65b06137fdSPaul Mullowney {
66b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
67b06137fdSPaul Mullowney   PetscFunctionBegin;
68b06137fdSPaul Mullowney   if (cusparsestruct->handle)
69b06137fdSPaul Mullowney     cusparsestruct->handle = 0;
70b06137fdSPaul Mullowney   PetscFunctionReturn(0);
71b06137fdSPaul Mullowney }
72b06137fdSPaul Mullowney 
73b06137fdSPaul Mullowney #undef __FUNCT__
749ae82921SPaul Mullowney #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
759ae82921SPaul Mullowney PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
769ae82921SPaul Mullowney {
779ae82921SPaul Mullowney   PetscFunctionBegin;
789ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
799ae82921SPaul Mullowney   PetscFunctionReturn(0);
809ae82921SPaul Mullowney }
819ae82921SPaul Mullowney 
82c708e6cdSJed Brown /*MC
83087f3262SPaul Mullowney   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
84087f3262SPaul Mullowney   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
85087f3262SPaul Mullowney   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
86087f3262SPaul Mullowney   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
87087f3262SPaul Mullowney   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
88087f3262SPaul Mullowney   algorithms are not recommended. This class does NOT support direct solver operations.
89c708e6cdSJed Brown 
909ae82921SPaul Mullowney   Level: beginner
91c708e6cdSJed Brown 
92c708e6cdSJed Brown .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
93c708e6cdSJed Brown M*/
949ae82921SPaul Mullowney 
959ae82921SPaul Mullowney #undef __FUNCT__
969ae82921SPaul Mullowney #define __FUNCT__ "MatGetFactor_seqaij_cusparse"
973c3a0fd4SSatish Balay PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
989ae82921SPaul Mullowney {
999ae82921SPaul Mullowney   PetscErrorCode ierr;
100bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1019ae82921SPaul Mullowney 
1029ae82921SPaul Mullowney   PetscFunctionBegin;
103bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
104404133a2SPaul Mullowney   (*B)->factortype = ftype;
105bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1069ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1072205254eSKarl Rupp 
108087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
10933d57670SJed Brown     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1109ae82921SPaul Mullowney     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1119ae82921SPaul Mullowney     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
112087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
113087f3262SPaul Mullowney     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
114087f3262SPaul Mullowney     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1159ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
116bc3f50f2SPaul Mullowney 
117fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
11862a20339SJed Brown   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
1199ae82921SPaul Mullowney   PetscFunctionReturn(0);
1209ae82921SPaul Mullowney }
1219ae82921SPaul Mullowney 
1229ae82921SPaul Mullowney #undef __FUNCT__
123e057df02SPaul Mullowney #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
124bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
125ca45077fSPaul Mullowney {
126aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1276e111a19SKarl Rupp 
128ca45077fSPaul Mullowney   PetscFunctionBegin;
1292692e278SPaul Mullowney #if CUDA_VERSION>=4020
130ca45077fSPaul Mullowney   switch (op) {
131e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
132aa372e3fSPaul Mullowney     cusparsestruct->format = format;
133ca45077fSPaul Mullowney     break;
134e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
135aa372e3fSPaul Mullowney     cusparsestruct->format = format;
136ca45077fSPaul Mullowney     break;
137ca45077fSPaul Mullowney   default:
13836d62e41SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
139ca45077fSPaul Mullowney   }
1402692e278SPaul Mullowney #else
1412692e278SPaul Mullowney   if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB)
1422692e278SPaul Mullowney     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
1432692e278SPaul Mullowney #endif
144ca45077fSPaul Mullowney   PetscFunctionReturn(0);
145ca45077fSPaul Mullowney }
1469ae82921SPaul Mullowney 
147e057df02SPaul Mullowney /*@
148e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
149e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
150aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
151e057df02SPaul Mullowney    Not Collective
152e057df02SPaul Mullowney 
153e057df02SPaul Mullowney    Input Parameters:
1548468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
15536d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
1562692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
157e057df02SPaul Mullowney 
158e057df02SPaul Mullowney    Output Parameter:
159e057df02SPaul Mullowney 
160e057df02SPaul Mullowney    Level: intermediate
161e057df02SPaul Mullowney 
1628468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
163e057df02SPaul Mullowney @*/
164e057df02SPaul Mullowney #undef __FUNCT__
165e057df02SPaul Mullowney #define __FUNCT__ "MatCUSPARSESetFormat"
166e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
167e057df02SPaul Mullowney {
168e057df02SPaul Mullowney   PetscErrorCode ierr;
1696e111a19SKarl Rupp 
170e057df02SPaul Mullowney   PetscFunctionBegin;
171e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
172e057df02SPaul Mullowney   ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
173e057df02SPaul Mullowney   PetscFunctionReturn(0);
174e057df02SPaul Mullowney }
175e057df02SPaul Mullowney 
1769ae82921SPaul Mullowney #undef __FUNCT__
1779ae82921SPaul Mullowney #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
1786fa9248bSJed Brown static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
1799ae82921SPaul Mullowney {
1809ae82921SPaul Mullowney   PetscErrorCode           ierr;
181e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
1829ae82921SPaul Mullowney   PetscBool                flg;
183*a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1846e111a19SKarl Rupp 
1859ae82921SPaul Mullowney   PetscFunctionBegin;
186e057df02SPaul Mullowney   ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
187e057df02SPaul Mullowney   ierr = PetscObjectOptionsBegin((PetscObject)A);
1889ae82921SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
189e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
190*a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
191e057df02SPaul Mullowney     if (flg) {
192e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
193045c96e1SPaul Mullowney     }
1949ae82921SPaul Mullowney   }
1954c87dfd4SPaul Mullowney   ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
196*a183c035SDominic Meiser                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
1974c87dfd4SPaul Mullowney   if (flg) {
1984c87dfd4SPaul Mullowney     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
1994c87dfd4SPaul Mullowney   }
2009ae82921SPaul Mullowney   ierr = PetscOptionsEnd();CHKERRQ(ierr);
2019ae82921SPaul Mullowney   PetscFunctionReturn(0);
2029ae82921SPaul Mullowney 
2039ae82921SPaul Mullowney }
2049ae82921SPaul Mullowney 
2059ae82921SPaul Mullowney #undef __FUNCT__
2069ae82921SPaul Mullowney #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
2076fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2089ae82921SPaul Mullowney {
2099ae82921SPaul Mullowney   PetscErrorCode ierr;
2109ae82921SPaul Mullowney 
2119ae82921SPaul Mullowney   PetscFunctionBegin;
2129ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
2139ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2149ae82921SPaul Mullowney   PetscFunctionReturn(0);
2159ae82921SPaul Mullowney }
2169ae82921SPaul Mullowney 
2179ae82921SPaul Mullowney #undef __FUNCT__
2189ae82921SPaul Mullowney #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
2196fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2209ae82921SPaul Mullowney {
2219ae82921SPaul Mullowney   PetscErrorCode ierr;
2229ae82921SPaul Mullowney 
2239ae82921SPaul Mullowney   PetscFunctionBegin;
2249ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
2259ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2269ae82921SPaul Mullowney   PetscFunctionReturn(0);
2279ae82921SPaul Mullowney }
2289ae82921SPaul Mullowney 
2299ae82921SPaul Mullowney #undef __FUNCT__
230087f3262SPaul Mullowney #define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
231087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
232087f3262SPaul Mullowney {
233087f3262SPaul Mullowney   PetscErrorCode ierr;
234087f3262SPaul Mullowney 
235087f3262SPaul Mullowney   PetscFunctionBegin;
236087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
237087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
238087f3262SPaul Mullowney   PetscFunctionReturn(0);
239087f3262SPaul Mullowney }
240087f3262SPaul Mullowney 
241087f3262SPaul Mullowney #undef __FUNCT__
242087f3262SPaul Mullowney #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
243087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
244087f3262SPaul Mullowney {
245087f3262SPaul Mullowney   PetscErrorCode ierr;
246087f3262SPaul Mullowney 
247087f3262SPaul Mullowney   PetscFunctionBegin;
248087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
249087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
250087f3262SPaul Mullowney   PetscFunctionReturn(0);
251087f3262SPaul Mullowney }
252087f3262SPaul Mullowney 
253087f3262SPaul Mullowney #undef __FUNCT__
254087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
255087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
2569ae82921SPaul Mullowney {
2579ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
2589ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
2599ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
260aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
2619ae82921SPaul Mullowney   cusparseStatus_t                  stat;
2629ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
2639ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
2649ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
2659ae82921SPaul Mullowney   PetscScalar                       *AALo;
2669ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
267b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
2689ae82921SPaul Mullowney 
2699ae82921SPaul Mullowney   PetscFunctionBegin;
2709ae82921SPaul Mullowney   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
2719ae82921SPaul Mullowney     try {
2729ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
2739ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
2749ae82921SPaul Mullowney 
2759ae82921SPaul Mullowney       /* Allocate Space for the lower triangular matrix */
2769ae82921SPaul Mullowney       ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
2779ae82921SPaul Mullowney       ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
2789ae82921SPaul Mullowney       ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);
2799ae82921SPaul Mullowney 
2809ae82921SPaul Mullowney       /* Fill the lower triangular matrix */
2819ae82921SPaul Mullowney       AiLo[0]  = (PetscInt) 0;
2829ae82921SPaul Mullowney       AiLo[n]  = nzLower;
2839ae82921SPaul Mullowney       AjLo[0]  = (PetscInt) 0;
2849ae82921SPaul Mullowney       AALo[0]  = (MatScalar) 1.0;
2859ae82921SPaul Mullowney       v        = aa;
2869ae82921SPaul Mullowney       vi       = aj;
2879ae82921SPaul Mullowney       offset   = 1;
2889ae82921SPaul Mullowney       rowOffset= 1;
2899ae82921SPaul Mullowney       for (i=1; i<n; i++) {
2909ae82921SPaul Mullowney         nz = ai[i+1] - ai[i];
291e057df02SPaul Mullowney         /* additional 1 for the term on the diagonal */
2929ae82921SPaul Mullowney         AiLo[i]    = rowOffset;
2939ae82921SPaul Mullowney         rowOffset += nz+1;
2949ae82921SPaul Mullowney 
2959ae82921SPaul Mullowney         ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
2969ae82921SPaul Mullowney         ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
2979ae82921SPaul Mullowney 
2989ae82921SPaul Mullowney         offset      += nz;
2999ae82921SPaul Mullowney         AjLo[offset] = (PetscInt) i;
3009ae82921SPaul Mullowney         AALo[offset] = (MatScalar) 1.0;
3019ae82921SPaul Mullowney         offset      += 1;
3029ae82921SPaul Mullowney 
3039ae82921SPaul Mullowney         v  += nz;
3049ae82921SPaul Mullowney         vi += nz;
3059ae82921SPaul Mullowney       }
3062205254eSKarl Rupp 
307aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
308aa372e3fSPaul Mullowney       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
3092205254eSKarl Rupp 
310aa372e3fSPaul Mullowney       /* Create the matrix description */
311aa372e3fSPaul Mullowney       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
312aa372e3fSPaul Mullowney       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
313aa372e3fSPaul Mullowney       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
314aa372e3fSPaul Mullowney       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
315aa372e3fSPaul Mullowney       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
316aa372e3fSPaul Mullowney 
317aa372e3fSPaul Mullowney       /* Create the solve analysis information */
318aa372e3fSPaul Mullowney       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);
319aa372e3fSPaul Mullowney 
320aa372e3fSPaul Mullowney       /* set the operation */
321aa372e3fSPaul Mullowney       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
322aa372e3fSPaul Mullowney 
323aa372e3fSPaul Mullowney       /* set the matrix */
324aa372e3fSPaul Mullowney       loTriFactor->csrMat = new CsrMatrix;
325aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_rows = n;
326aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_cols = n;
327aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_entries = nzLower;
328aa372e3fSPaul Mullowney 
329aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
330aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
331aa372e3fSPaul Mullowney 
332aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
333aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
334aa372e3fSPaul Mullowney 
335aa372e3fSPaul Mullowney       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
336aa372e3fSPaul Mullowney       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
337aa372e3fSPaul Mullowney 
338aa372e3fSPaul Mullowney       /* perform the solve analysis */
339aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
340aa372e3fSPaul Mullowney                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
341aa372e3fSPaul Mullowney                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
342aa372e3fSPaul Mullowney                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);
343aa372e3fSPaul Mullowney 
344aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
345aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
3462205254eSKarl Rupp 
3479ae82921SPaul Mullowney       ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
3489ae82921SPaul Mullowney       ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
3499ae82921SPaul Mullowney       ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
3509ae82921SPaul Mullowney     } catch(char *ex) {
3519ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
3529ae82921SPaul Mullowney     }
3539ae82921SPaul Mullowney   }
3549ae82921SPaul Mullowney   PetscFunctionReturn(0);
3559ae82921SPaul Mullowney }
3569ae82921SPaul Mullowney 
3579ae82921SPaul Mullowney #undef __FUNCT__
358087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
359087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
3609ae82921SPaul Mullowney {
3619ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3629ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3639ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
364aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
3659ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3669ae82921SPaul Mullowney   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
3679ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3689ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
3699ae82921SPaul Mullowney   PetscScalar                       *AAUp;
3709ae82921SPaul Mullowney   PetscInt                          i,nz, nzUpper, offset;
3719ae82921SPaul Mullowney   PetscErrorCode                    ierr;
3729ae82921SPaul Mullowney 
3739ae82921SPaul Mullowney   PetscFunctionBegin;
3749ae82921SPaul Mullowney   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
3759ae82921SPaul Mullowney     try {
3769ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
3779ae82921SPaul Mullowney       nzUpper = adiag[0]-adiag[n];
3789ae82921SPaul Mullowney 
3799ae82921SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
3809ae82921SPaul Mullowney       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
3819ae82921SPaul Mullowney       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
3829ae82921SPaul Mullowney       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
3839ae82921SPaul Mullowney 
3849ae82921SPaul Mullowney       /* Fill the upper triangular matrix */
3859ae82921SPaul Mullowney       AiUp[0]=(PetscInt) 0;
3869ae82921SPaul Mullowney       AiUp[n]=nzUpper;
3879ae82921SPaul Mullowney       offset = nzUpper;
3889ae82921SPaul Mullowney       for (i=n-1; i>=0; i--) {
3899ae82921SPaul Mullowney         v  = aa + adiag[i+1] + 1;
3909ae82921SPaul Mullowney         vi = aj + adiag[i+1] + 1;
3919ae82921SPaul Mullowney 
392e057df02SPaul Mullowney         /* number of elements NOT on the diagonal */
3939ae82921SPaul Mullowney         nz = adiag[i] - adiag[i+1]-1;
3949ae82921SPaul Mullowney 
395e057df02SPaul Mullowney         /* decrement the offset */
3969ae82921SPaul Mullowney         offset -= (nz+1);
3979ae82921SPaul Mullowney 
398e057df02SPaul Mullowney         /* first, set the diagonal elements */
3999ae82921SPaul Mullowney         AjUp[offset] = (PetscInt) i;
4009ae82921SPaul Mullowney         AAUp[offset] = 1./v[nz];
4019ae82921SPaul Mullowney         AiUp[i]      = AiUp[i+1] - (nz+1);
4029ae82921SPaul Mullowney 
4039ae82921SPaul Mullowney         ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
4049ae82921SPaul Mullowney         ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
4059ae82921SPaul Mullowney       }
4062205254eSKarl Rupp 
407aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
408aa372e3fSPaul Mullowney       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
4092205254eSKarl Rupp 
410aa372e3fSPaul Mullowney       /* Create the matrix description */
411aa372e3fSPaul Mullowney       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
412aa372e3fSPaul Mullowney       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
413aa372e3fSPaul Mullowney       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
414aa372e3fSPaul Mullowney       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
415aa372e3fSPaul Mullowney       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
416aa372e3fSPaul Mullowney 
417aa372e3fSPaul Mullowney       /* Create the solve analysis information */
418aa372e3fSPaul Mullowney       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);
419aa372e3fSPaul Mullowney 
420aa372e3fSPaul Mullowney       /* set the operation */
421aa372e3fSPaul Mullowney       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
422aa372e3fSPaul Mullowney 
423aa372e3fSPaul Mullowney       /* set the matrix */
424aa372e3fSPaul Mullowney       upTriFactor->csrMat = new CsrMatrix;
425aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_rows = n;
426aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_cols = n;
427aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_entries = nzUpper;
428aa372e3fSPaul Mullowney 
429aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
430aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
431aa372e3fSPaul Mullowney 
432aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
433aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
434aa372e3fSPaul Mullowney 
435aa372e3fSPaul Mullowney       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
436aa372e3fSPaul Mullowney       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
437aa372e3fSPaul Mullowney 
438aa372e3fSPaul Mullowney       /* perform the solve analysis */
439aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
440aa372e3fSPaul Mullowney                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
441aa372e3fSPaul Mullowney                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
442aa372e3fSPaul Mullowney                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);
443aa372e3fSPaul Mullowney 
444aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
445aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
4462205254eSKarl Rupp 
4479ae82921SPaul Mullowney       ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
4489ae82921SPaul Mullowney       ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
4499ae82921SPaul Mullowney       ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
4509ae82921SPaul Mullowney     } catch(char *ex) {
4519ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4529ae82921SPaul Mullowney     }
4539ae82921SPaul Mullowney   }
4549ae82921SPaul Mullowney   PetscFunctionReturn(0);
4559ae82921SPaul Mullowney }
4569ae82921SPaul Mullowney 
4579ae82921SPaul Mullowney #undef __FUNCT__
458087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
459087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
4609ae82921SPaul Mullowney {
4619ae82921SPaul Mullowney   PetscErrorCode               ierr;
4629ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
4639ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
4649ae82921SPaul Mullowney   IS                           isrow = a->row,iscol = a->icol;
4659ae82921SPaul Mullowney   PetscBool                    row_identity,col_identity;
4669ae82921SPaul Mullowney   const PetscInt               *r,*c;
4679ae82921SPaul Mullowney   PetscInt                     n = A->rmap->n;
4689ae82921SPaul Mullowney 
4699ae82921SPaul Mullowney   PetscFunctionBegin;
470087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
471087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
4722205254eSKarl Rupp 
473aa372e3fSPaul Mullowney   cusparseTriFactors->workVector = new THRUSTARRAY;
474aa372e3fSPaul Mullowney   cusparseTriFactors->workVector->resize(n);
475aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=a->nz;
4769ae82921SPaul Mullowney 
4779ae82921SPaul Mullowney   A->valid_GPU_matrix = PETSC_CUSP_BOTH;
478e057df02SPaul Mullowney   /*lower triangular indices */
4799ae82921SPaul Mullowney   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4809ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
4812205254eSKarl Rupp   if (!row_identity) {
482aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
483aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r+n);
4842205254eSKarl Rupp   }
4859ae82921SPaul Mullowney   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
4869ae82921SPaul Mullowney 
487e057df02SPaul Mullowney   /*upper triangular indices */
4889ae82921SPaul Mullowney   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
4899ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
4902205254eSKarl Rupp   if (!col_identity) {
491aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
492aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c+n);
4932205254eSKarl Rupp   }
4949ae82921SPaul Mullowney   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
4959ae82921SPaul Mullowney   PetscFunctionReturn(0);
4969ae82921SPaul Mullowney }
4979ae82921SPaul Mullowney 
4989ae82921SPaul Mullowney #undef __FUNCT__
499087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
500087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
501087f3262SPaul Mullowney {
502087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
503087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
504aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
505aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
506087f3262SPaul Mullowney   cusparseStatus_t                  stat;
507087f3262SPaul Mullowney   PetscErrorCode                    ierr;
508087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
509087f3262SPaul Mullowney   PetscScalar                       *AAUp;
510087f3262SPaul Mullowney   PetscScalar                       *AALo;
511087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
512087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
513087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
514087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
515087f3262SPaul Mullowney 
516087f3262SPaul Mullowney   PetscFunctionBegin;
517087f3262SPaul Mullowney   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
518087f3262SPaul Mullowney     try {
519087f3262SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
520087f3262SPaul Mullowney       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
521087f3262SPaul Mullowney       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
522087f3262SPaul Mullowney       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
523087f3262SPaul Mullowney       ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
524087f3262SPaul Mullowney 
525087f3262SPaul Mullowney       /* Fill the upper triangular matrix */
526087f3262SPaul Mullowney       AiUp[0]=(PetscInt) 0;
527087f3262SPaul Mullowney       AiUp[n]=nzUpper;
528087f3262SPaul Mullowney       offset = 0;
529087f3262SPaul Mullowney       for (i=0; i<n; i++) {
530087f3262SPaul Mullowney         /* set the pointers */
531087f3262SPaul Mullowney         v  = aa + ai[i];
532087f3262SPaul Mullowney         vj = aj + ai[i];
533087f3262SPaul Mullowney         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
534087f3262SPaul Mullowney 
535087f3262SPaul Mullowney         /* first, set the diagonal elements */
536087f3262SPaul Mullowney         AjUp[offset] = (PetscInt) i;
537087f3262SPaul Mullowney         AAUp[offset] = 1.0/v[nz];
538087f3262SPaul Mullowney         AiUp[i]      = offset;
539087f3262SPaul Mullowney         AALo[offset] = 1.0/v[nz];
540087f3262SPaul Mullowney 
541087f3262SPaul Mullowney         offset+=1;
542087f3262SPaul Mullowney         if (nz>0) {
543087f3262SPaul Mullowney           ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
544087f3262SPaul Mullowney           ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
545087f3262SPaul Mullowney           for (j=offset; j<offset+nz; j++) {
546087f3262SPaul Mullowney             AAUp[j] = -AAUp[j];
547087f3262SPaul Mullowney             AALo[j] = AAUp[j]/v[nz];
548087f3262SPaul Mullowney           }
549087f3262SPaul Mullowney           offset+=nz;
550087f3262SPaul Mullowney         }
551087f3262SPaul Mullowney       }
552087f3262SPaul Mullowney 
553aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
554aa372e3fSPaul Mullowney       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
555087f3262SPaul Mullowney 
556aa372e3fSPaul Mullowney       /* Create the matrix description */
557aa372e3fSPaul Mullowney       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
558aa372e3fSPaul Mullowney       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
559aa372e3fSPaul Mullowney       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
560aa372e3fSPaul Mullowney       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
561aa372e3fSPaul Mullowney       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
562087f3262SPaul Mullowney 
563aa372e3fSPaul Mullowney       /* Create the solve analysis information */
564aa372e3fSPaul Mullowney       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);
565aa372e3fSPaul Mullowney 
566aa372e3fSPaul Mullowney       /* set the operation */
567aa372e3fSPaul Mullowney       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
568aa372e3fSPaul Mullowney 
569aa372e3fSPaul Mullowney       /* set the matrix */
570aa372e3fSPaul Mullowney       upTriFactor->csrMat = new CsrMatrix;
571aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_rows = A->rmap->n;
572aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_cols = A->cmap->n;
573aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_entries = a->nz;
574aa372e3fSPaul Mullowney 
575aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
576aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
577aa372e3fSPaul Mullowney 
578aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
579aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
580aa372e3fSPaul Mullowney 
581aa372e3fSPaul Mullowney       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
582aa372e3fSPaul Mullowney       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
583aa372e3fSPaul Mullowney 
584aa372e3fSPaul Mullowney       /* perform the solve analysis */
585aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
586aa372e3fSPaul Mullowney                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
587aa372e3fSPaul Mullowney                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
588aa372e3fSPaul Mullowney                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);
589aa372e3fSPaul Mullowney 
590aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
591aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
592aa372e3fSPaul Mullowney 
593aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
594aa372e3fSPaul Mullowney       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
595aa372e3fSPaul Mullowney 
596aa372e3fSPaul Mullowney       /* Create the matrix description */
597aa372e3fSPaul Mullowney       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
598aa372e3fSPaul Mullowney       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
599aa372e3fSPaul Mullowney       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
600aa372e3fSPaul Mullowney       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
601aa372e3fSPaul Mullowney       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
602aa372e3fSPaul Mullowney 
603aa372e3fSPaul Mullowney       /* Create the solve analysis information */
604aa372e3fSPaul Mullowney       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);
605aa372e3fSPaul Mullowney 
606aa372e3fSPaul Mullowney       /* set the operation */
607aa372e3fSPaul Mullowney       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
608aa372e3fSPaul Mullowney 
609aa372e3fSPaul Mullowney       /* set the matrix */
610aa372e3fSPaul Mullowney       loTriFactor->csrMat = new CsrMatrix;
611aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_rows = A->rmap->n;
612aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_cols = A->cmap->n;
613aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_entries = a->nz;
614aa372e3fSPaul Mullowney 
615aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
616aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
617aa372e3fSPaul Mullowney 
618aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
619aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
620aa372e3fSPaul Mullowney 
621aa372e3fSPaul Mullowney       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
622aa372e3fSPaul Mullowney       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
623aa372e3fSPaul Mullowney 
624aa372e3fSPaul Mullowney       /* perform the solve analysis */
625aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
626aa372e3fSPaul Mullowney                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
627aa372e3fSPaul Mullowney                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
628aa372e3fSPaul Mullowney                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);
629aa372e3fSPaul Mullowney 
630aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
631aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
632087f3262SPaul Mullowney 
633087f3262SPaul Mullowney       A->valid_GPU_matrix = PETSC_CUSP_BOTH;
634087f3262SPaul Mullowney       ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
635087f3262SPaul Mullowney       ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
636087f3262SPaul Mullowney       ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
637087f3262SPaul Mullowney       ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
638087f3262SPaul Mullowney     } catch(char *ex) {
639087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
640087f3262SPaul Mullowney     }
641087f3262SPaul Mullowney   }
642087f3262SPaul Mullowney   PetscFunctionReturn(0);
643087f3262SPaul Mullowney }
644087f3262SPaul Mullowney 
645087f3262SPaul Mullowney #undef __FUNCT__
646087f3262SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
647087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
6489ae82921SPaul Mullowney {
6499ae82921SPaul Mullowney   PetscErrorCode               ierr;
650087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
651087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
652087f3262SPaul Mullowney   IS                           ip = a->row;
653087f3262SPaul Mullowney   const PetscInt               *rip;
654087f3262SPaul Mullowney   PetscBool                    perm_identity;
655087f3262SPaul Mullowney   PetscInt                     n = A->rmap->n;
656087f3262SPaul Mullowney 
657087f3262SPaul Mullowney   PetscFunctionBegin;
658087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
659aa372e3fSPaul Mullowney   cusparseTriFactors->workVector = new THRUSTARRAY;
660aa372e3fSPaul Mullowney   cusparseTriFactors->workVector->resize(n);
661aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
662aa372e3fSPaul Mullowney 
663087f3262SPaul Mullowney   /*lower triangular indices */
664087f3262SPaul Mullowney   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
665087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
666087f3262SPaul Mullowney   if (!perm_identity) {
667aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
668aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
669aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
670aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(rip, rip+n);
671087f3262SPaul Mullowney   }
672087f3262SPaul Mullowney   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
673087f3262SPaul Mullowney   PetscFunctionReturn(0);
674087f3262SPaul Mullowney }
675087f3262SPaul Mullowney 
676087f3262SPaul Mullowney #undef __FUNCT__
6779ae82921SPaul Mullowney #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
6786fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
6799ae82921SPaul Mullowney {
6809ae82921SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
6819ae82921SPaul Mullowney   IS             isrow = b->row,iscol = b->col;
6829ae82921SPaul Mullowney   PetscBool      row_identity,col_identity;
683b175d8bbSPaul Mullowney   PetscErrorCode ierr;
6849ae82921SPaul Mullowney 
6859ae82921SPaul Mullowney   PetscFunctionBegin;
6869ae82921SPaul Mullowney   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
687e057df02SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
6889ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
6899ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
690bda325fcSPaul Mullowney   if (row_identity && col_identity) {
691bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
692bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
693bda325fcSPaul Mullowney   } else {
694bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
695bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
696bda325fcSPaul Mullowney   }
6978dc1d2a3SPaul Mullowney 
698e057df02SPaul Mullowney   /* get the triangular factors */
699087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
7009ae82921SPaul Mullowney   PetscFunctionReturn(0);
7019ae82921SPaul Mullowney }
7029ae82921SPaul Mullowney 
703087f3262SPaul Mullowney #undef __FUNCT__
704087f3262SPaul Mullowney #define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
705087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
706087f3262SPaul Mullowney {
707087f3262SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
708087f3262SPaul Mullowney   IS             ip = b->row;
709087f3262SPaul Mullowney   PetscBool      perm_identity;
710b175d8bbSPaul Mullowney   PetscErrorCode ierr;
711087f3262SPaul Mullowney 
712087f3262SPaul Mullowney   PetscFunctionBegin;
713087f3262SPaul Mullowney   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
714087f3262SPaul Mullowney 
715087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
716087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
717087f3262SPaul Mullowney   if (perm_identity) {
718087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
719087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
720087f3262SPaul Mullowney   } else {
721087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
722087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
723087f3262SPaul Mullowney   }
724087f3262SPaul Mullowney 
725087f3262SPaul Mullowney   /* get the triangular factors */
726087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
727087f3262SPaul Mullowney   PetscFunctionReturn(0);
728087f3262SPaul Mullowney }
7299ae82921SPaul Mullowney 
730bda325fcSPaul Mullowney #undef __FUNCT__
731bda325fcSPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
732b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
733bda325fcSPaul Mullowney {
734bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
735aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
736aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
737aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
738aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
739bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
740aa372e3fSPaul Mullowney   cusparseIndexBase_t               indexBase;
741aa372e3fSPaul Mullowney   cusparseMatrixType_t              matrixType;
742aa372e3fSPaul Mullowney   cusparseFillMode_t                fillMode;
743aa372e3fSPaul Mullowney   cusparseDiagType_t                diagType;
744b175d8bbSPaul Mullowney 
745bda325fcSPaul Mullowney   PetscFunctionBegin;
746bda325fcSPaul Mullowney 
747aa372e3fSPaul Mullowney   /*********************************************/
748aa372e3fSPaul Mullowney   /* Now the Transpose of the Lower Tri Factor */
749aa372e3fSPaul Mullowney   /*********************************************/
750aa372e3fSPaul Mullowney 
751aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
752aa372e3fSPaul Mullowney   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
753aa372e3fSPaul Mullowney 
754aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
755aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
756aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
757aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
758aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
759aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(loTriFactor->descr);
760aa372e3fSPaul Mullowney 
761aa372e3fSPaul Mullowney   /* Create the matrix description */
762aa372e3fSPaul Mullowney   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
763aa372e3fSPaul Mullowney   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
764aa372e3fSPaul Mullowney   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
765aa372e3fSPaul Mullowney   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
766aa372e3fSPaul Mullowney   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);
767aa372e3fSPaul Mullowney 
768aa372e3fSPaul Mullowney   /* Create the solve analysis information */
769aa372e3fSPaul Mullowney   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);
770aa372e3fSPaul Mullowney 
771aa372e3fSPaul Mullowney   /* set the operation */
772aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
773aa372e3fSPaul Mullowney 
774aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
775aa372e3fSPaul Mullowney   loTriFactorT->csrMat = new CsrMatrix;
776aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
777aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
778aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
779aa372e3fSPaul Mullowney   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
780aa372e3fSPaul Mullowney   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
781aa372e3fSPaul Mullowney   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);
782aa372e3fSPaul Mullowney 
783aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
784aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
785aa372e3fSPaul Mullowney                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
786aa372e3fSPaul Mullowney                           loTriFactor->csrMat->values->data().get(),
787aa372e3fSPaul Mullowney                           loTriFactor->csrMat->row_offsets->data().get(),
788aa372e3fSPaul Mullowney                           loTriFactor->csrMat->column_indices->data().get(),
789aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->values->data().get(),
790aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->column_indices->data().get(),
791aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->row_offsets->data().get(),
792aa372e3fSPaul Mullowney                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);
793aa372e3fSPaul Mullowney 
794aa372e3fSPaul Mullowney   /* perform the solve analysis on the transposed matrix */
795aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
796aa372e3fSPaul Mullowney                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
797aa372e3fSPaul Mullowney                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
798aa372e3fSPaul Mullowney                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
799aa372e3fSPaul Mullowney                            loTriFactorT->solveInfo);CHKERRCUSP(stat);
800aa372e3fSPaul Mullowney 
801aa372e3fSPaul Mullowney   /* assign the pointer. Is this really necessary? */
802aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
803aa372e3fSPaul Mullowney 
804aa372e3fSPaul Mullowney   /*********************************************/
805aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
806aa372e3fSPaul Mullowney   /*********************************************/
807aa372e3fSPaul Mullowney 
808aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
809aa372e3fSPaul Mullowney   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
810aa372e3fSPaul Mullowney 
811aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
812aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
813aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
814aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
815aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
816aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(upTriFactor->descr);
817aa372e3fSPaul Mullowney 
818aa372e3fSPaul Mullowney   /* Create the matrix description */
819aa372e3fSPaul Mullowney   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
820aa372e3fSPaul Mullowney   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
821aa372e3fSPaul Mullowney   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
822aa372e3fSPaul Mullowney   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
823aa372e3fSPaul Mullowney   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);
824aa372e3fSPaul Mullowney 
825aa372e3fSPaul Mullowney   /* Create the solve analysis information */
826aa372e3fSPaul Mullowney   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);
827aa372e3fSPaul Mullowney 
828aa372e3fSPaul Mullowney   /* set the operation */
829aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
830aa372e3fSPaul Mullowney 
831aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
832aa372e3fSPaul Mullowney   upTriFactorT->csrMat = new CsrMatrix;
833aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
834aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
835aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
836aa372e3fSPaul Mullowney   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
837aa372e3fSPaul Mullowney   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
838aa372e3fSPaul Mullowney   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);
839aa372e3fSPaul Mullowney 
840aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
841aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
842aa372e3fSPaul Mullowney                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
843aa372e3fSPaul Mullowney                           upTriFactor->csrMat->values->data().get(),
844aa372e3fSPaul Mullowney                           upTriFactor->csrMat->row_offsets->data().get(),
845aa372e3fSPaul Mullowney                           upTriFactor->csrMat->column_indices->data().get(),
846aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->values->data().get(),
847aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->column_indices->data().get(),
848aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->row_offsets->data().get(),
849aa372e3fSPaul Mullowney                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);
850aa372e3fSPaul Mullowney 
851aa372e3fSPaul Mullowney   /* perform the solve analysis on the transposed matrix */
852aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
853aa372e3fSPaul Mullowney                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
854aa372e3fSPaul Mullowney                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
855aa372e3fSPaul Mullowney                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
856aa372e3fSPaul Mullowney                            upTriFactorT->solveInfo);CHKERRCUSP(stat);
857aa372e3fSPaul Mullowney 
858aa372e3fSPaul Mullowney   /* assign the pointer. Is this really necessary? */
859aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
860bda325fcSPaul Mullowney   PetscFunctionReturn(0);
861bda325fcSPaul Mullowney }
862bda325fcSPaul Mullowney 
863bda325fcSPaul Mullowney #undef __FUNCT__
864bda325fcSPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
865b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
866bda325fcSPaul Mullowney {
867aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
868aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
869aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
870bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
871bda325fcSPaul Mullowney   cusparseStatus_t             stat;
872aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
873b06137fdSPaul Mullowney   cudaError_t                  err;
874b175d8bbSPaul Mullowney 
875bda325fcSPaul Mullowney   PetscFunctionBegin;
876aa372e3fSPaul Mullowney 
877aa372e3fSPaul Mullowney   /* allocate space for the triangular factor information */
878aa372e3fSPaul Mullowney   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
879aa372e3fSPaul Mullowney   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
880aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(matstruct->descr);
881aa372e3fSPaul Mullowney   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
882aa372e3fSPaul Mullowney   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);
883aa372e3fSPaul Mullowney 
884b06137fdSPaul Mullowney   /* set alpha and beta */
885b06137fdSPaul Mullowney   err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
886b06137fdSPaul Mullowney   err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
887b06137fdSPaul Mullowney   err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err);
888b06137fdSPaul Mullowney   err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
889b06137fdSPaul Mullowney   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
890b06137fdSPaul Mullowney 
891aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
892aa372e3fSPaul Mullowney     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
893aa372e3fSPaul Mullowney     CsrMatrix *matrixT= new CsrMatrix;
894aa372e3fSPaul Mullowney     matrixT->num_rows = A->rmap->n;
895aa372e3fSPaul Mullowney     matrixT->num_cols = A->cmap->n;
896aa372e3fSPaul Mullowney     matrixT->num_entries = a->nz;
897aa372e3fSPaul Mullowney     matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
898aa372e3fSPaul Mullowney     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
899aa372e3fSPaul Mullowney     matrixT->values = new THRUSTARRAY(a->nz);
900aa372e3fSPaul Mullowney 
901aa372e3fSPaul Mullowney     /* compute the transpose of the upper triangular factor, i.e. the CSC */
902aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
903aa372e3fSPaul Mullowney     stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
904aa372e3fSPaul Mullowney                             matrix->num_cols, matrix->num_entries,
905aa372e3fSPaul Mullowney                             matrix->values->data().get(),
906aa372e3fSPaul Mullowney                             matrix->row_offsets->data().get(),
907aa372e3fSPaul Mullowney                             matrix->column_indices->data().get(),
908aa372e3fSPaul Mullowney                             matrixT->values->data().get(),
909aa372e3fSPaul Mullowney                             matrixT->column_indices->data().get(),
910aa372e3fSPaul Mullowney                             matrixT->row_offsets->data().get(),
911aa372e3fSPaul Mullowney                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);
912aa372e3fSPaul Mullowney 
913aa372e3fSPaul Mullowney     /* assign the pointer */
914aa372e3fSPaul Mullowney     matstructT->mat = matrixT;
915aa372e3fSPaul Mullowney 
916aa372e3fSPaul Mullowney   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
9172692e278SPaul Mullowney #if CUDA_VERSION>=5000
918aa372e3fSPaul Mullowney     /* First convert HYB to CSR */
919aa372e3fSPaul Mullowney     CsrMatrix *temp= new CsrMatrix;
920aa372e3fSPaul Mullowney     temp->num_rows = A->rmap->n;
921aa372e3fSPaul Mullowney     temp->num_cols = A->cmap->n;
922aa372e3fSPaul Mullowney     temp->num_entries = a->nz;
923aa372e3fSPaul Mullowney     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
924aa372e3fSPaul Mullowney     temp->column_indices = new THRUSTINTARRAY32(a->nz);
925aa372e3fSPaul Mullowney     temp->values = new THRUSTARRAY(a->nz);
926aa372e3fSPaul Mullowney 
9272692e278SPaul Mullowney 
928aa372e3fSPaul Mullowney     stat = cusparse_hyb2csr(cusparsestruct->handle,
929aa372e3fSPaul Mullowney                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
930aa372e3fSPaul Mullowney                             temp->values->data().get(),
931aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
932aa372e3fSPaul Mullowney                             temp->column_indices->data().get());CHKERRCUSP(stat);
933aa372e3fSPaul Mullowney 
934aa372e3fSPaul Mullowney     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
935aa372e3fSPaul Mullowney     CsrMatrix *tempT= new CsrMatrix;
936aa372e3fSPaul Mullowney     tempT->num_rows = A->rmap->n;
937aa372e3fSPaul Mullowney     tempT->num_cols = A->cmap->n;
938aa372e3fSPaul Mullowney     tempT->num_entries = a->nz;
939aa372e3fSPaul Mullowney     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
940aa372e3fSPaul Mullowney     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
941aa372e3fSPaul Mullowney     tempT->values = new THRUSTARRAY(a->nz);
942aa372e3fSPaul Mullowney 
943aa372e3fSPaul Mullowney     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
944aa372e3fSPaul Mullowney                             temp->num_cols, temp->num_entries,
945aa372e3fSPaul Mullowney                             temp->values->data().get(),
946aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
947aa372e3fSPaul Mullowney                             temp->column_indices->data().get(),
948aa372e3fSPaul Mullowney                             tempT->values->data().get(),
949aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
950aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
951aa372e3fSPaul Mullowney                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);
952aa372e3fSPaul Mullowney 
953aa372e3fSPaul Mullowney     /* Last, convert CSC to HYB */
954aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat;
955aa372e3fSPaul Mullowney     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
956aa372e3fSPaul Mullowney     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
957aa372e3fSPaul Mullowney       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
958aa372e3fSPaul Mullowney     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
959aa372e3fSPaul Mullowney                             matstructT->descr, tempT->values->data().get(),
960aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
961aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
962aa372e3fSPaul Mullowney                             hybMat, 0, partition);CHKERRCUSP(stat);
963aa372e3fSPaul Mullowney 
964aa372e3fSPaul Mullowney     /* assign the pointer */
965aa372e3fSPaul Mullowney     matstructT->mat = hybMat;
966aa372e3fSPaul Mullowney 
967aa372e3fSPaul Mullowney     /* delete temporaries */
968aa372e3fSPaul Mullowney     if (tempT) {
969aa372e3fSPaul Mullowney       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
970aa372e3fSPaul Mullowney       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
971aa372e3fSPaul Mullowney       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
972aa372e3fSPaul Mullowney       delete (CsrMatrix*) tempT;
973087f3262SPaul Mullowney     }
974aa372e3fSPaul Mullowney     if (temp) {
975aa372e3fSPaul Mullowney       if (temp->values) delete (THRUSTARRAY*) temp->values;
976aa372e3fSPaul Mullowney       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
977aa372e3fSPaul Mullowney       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
978aa372e3fSPaul Mullowney       delete (CsrMatrix*) temp;
979aa372e3fSPaul Mullowney     }
9802692e278SPaul Mullowney #else
9812692e278SPaul Mullowney     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
9822692e278SPaul Mullowney #endif
983aa372e3fSPaul Mullowney   }
984aa372e3fSPaul Mullowney   /* assign the compressed row indices */
985aa372e3fSPaul Mullowney   matstructT->cprowIndices = new THRUSTINTARRAY;
986aa372e3fSPaul Mullowney 
987aa372e3fSPaul Mullowney   /* assign the pointer */
988aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
989bda325fcSPaul Mullowney   PetscFunctionReturn(0);
990bda325fcSPaul Mullowney }
991bda325fcSPaul Mullowney 
992bda325fcSPaul Mullowney #undef __FUNCT__
993bda325fcSPaul Mullowney #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
9946fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
995bda325fcSPaul Mullowney {
996bda325fcSPaul Mullowney   CUSPARRAY                         *xGPU, *bGPU;
997bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
998bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
999aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1000aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1001aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1002b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
1003bda325fcSPaul Mullowney 
1004bda325fcSPaul Mullowney   PetscFunctionBegin;
1005aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1006aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1007bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1008aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1009aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1010bda325fcSPaul Mullowney   }
1011bda325fcSPaul Mullowney 
1012bda325fcSPaul Mullowney   /* Get the GPU pointers */
1013bda325fcSPaul Mullowney   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
1014bda325fcSPaul Mullowney   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
1015bda325fcSPaul Mullowney 
1016aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1017aa372e3fSPaul Mullowney   thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
1018aa372e3fSPaul Mullowney                thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
1019aa372e3fSPaul Mullowney                xGPU->begin());
1020aa372e3fSPaul Mullowney 
1021aa372e3fSPaul Mullowney   /* First, solve U */
1022aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1023aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
1024aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1025aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1026aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1027aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1028aa372e3fSPaul Mullowney                         xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);
1029aa372e3fSPaul Mullowney 
1030aa372e3fSPaul Mullowney   /* Then, solve L */
1031aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1032aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1033aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1034aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1035aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1036aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1037aa372e3fSPaul Mullowney                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);
1038aa372e3fSPaul Mullowney 
1039aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1040aa372e3fSPaul Mullowney   thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1041aa372e3fSPaul Mullowney                thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
1042aa372e3fSPaul Mullowney                tempGPU->begin());
1043aa372e3fSPaul Mullowney 
1044aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1045aa372e3fSPaul Mullowney   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());
1046bda325fcSPaul Mullowney 
1047bda325fcSPaul Mullowney   /* restore */
1048bda325fcSPaul Mullowney   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
1049bda325fcSPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
1050bda325fcSPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
1051087f3262SPaul Mullowney 
1052aa372e3fSPaul Mullowney   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1053bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1054bda325fcSPaul Mullowney }
1055bda325fcSPaul Mullowney 
1056bda325fcSPaul Mullowney #undef __FUNCT__
1057bda325fcSPaul Mullowney #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
10586fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1059bda325fcSPaul Mullowney {
1060bda325fcSPaul Mullowney   CUSPARRAY                         *xGPU,*bGPU;
1061bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1062bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1063aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1064aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1065aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1066b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
1067bda325fcSPaul Mullowney 
1068bda325fcSPaul Mullowney   PetscFunctionBegin;
1069aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1070aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1071bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1072aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1073aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1074bda325fcSPaul Mullowney   }
1075bda325fcSPaul Mullowney 
1076bda325fcSPaul Mullowney   /* Get the GPU pointers */
1077bda325fcSPaul Mullowney   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
1078bda325fcSPaul Mullowney   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
1079bda325fcSPaul Mullowney 
1080aa372e3fSPaul Mullowney   /* First, solve U */
1081aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1082aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
1083aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1084aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1085aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1086aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1087aa372e3fSPaul Mullowney                         bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);
1088aa372e3fSPaul Mullowney 
1089aa372e3fSPaul Mullowney   /* Then, solve L */
1090aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1091aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1092aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1093aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1094aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1095aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1096aa372e3fSPaul Mullowney                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);
1097bda325fcSPaul Mullowney 
1098bda325fcSPaul Mullowney   /* restore */
1099bda325fcSPaul Mullowney   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
1100bda325fcSPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
1101bda325fcSPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
1102aa372e3fSPaul Mullowney   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1103bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1104bda325fcSPaul Mullowney }
1105bda325fcSPaul Mullowney 
11069ae82921SPaul Mullowney #undef __FUNCT__
11079ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
11086fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
11099ae82921SPaul Mullowney {
1110bda325fcSPaul Mullowney   CUSPARRAY                         *xGPU,*bGPU;
11119ae82921SPaul Mullowney   cusparseStatus_t                  stat;
11129ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1113aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1114aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1115aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1116b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
11179ae82921SPaul Mullowney 
11189ae82921SPaul Mullowney   PetscFunctionBegin;
1119e057df02SPaul Mullowney   /* Get the GPU pointers */
11209ae82921SPaul Mullowney   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
11219ae82921SPaul Mullowney   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
11229ae82921SPaul Mullowney 
1123aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1124aa372e3fSPaul Mullowney   thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
1125aa372e3fSPaul Mullowney                thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
1126aa372e3fSPaul Mullowney                xGPU->begin());
1127aa372e3fSPaul Mullowney 
1128aa372e3fSPaul Mullowney   /* Next, solve L */
1129aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1130aa372e3fSPaul Mullowney                         loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1131aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1132aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1133aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1134aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1135aa372e3fSPaul Mullowney                         xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);
1136aa372e3fSPaul Mullowney 
1137aa372e3fSPaul Mullowney   /* Then, solve U */
1138aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1139aa372e3fSPaul Mullowney                         upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1140aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1141aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1142aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1143aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1144aa372e3fSPaul Mullowney                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);
1145aa372e3fSPaul Mullowney 
1146aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1147aa372e3fSPaul Mullowney   thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1148aa372e3fSPaul Mullowney                thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
1149aa372e3fSPaul Mullowney                tempGPU->begin());
1150aa372e3fSPaul Mullowney 
1151aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1152aa372e3fSPaul Mullowney   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());
11539ae82921SPaul Mullowney 
11549ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
11559ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
11569ae82921SPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
1157aa372e3fSPaul Mullowney   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
11589ae82921SPaul Mullowney   PetscFunctionReturn(0);
11599ae82921SPaul Mullowney }
11609ae82921SPaul Mullowney 
11619ae82921SPaul Mullowney #undef __FUNCT__
11629ae82921SPaul Mullowney #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
11636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
11649ae82921SPaul Mullowney {
1165bda325fcSPaul Mullowney   CUSPARRAY                         *xGPU,*bGPU;
11669ae82921SPaul Mullowney   cusparseStatus_t                  stat;
11679ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1168aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1169aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1170aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1171b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
11729ae82921SPaul Mullowney 
11739ae82921SPaul Mullowney   PetscFunctionBegin;
1174e057df02SPaul Mullowney   /* Get the GPU pointers */
11759ae82921SPaul Mullowney   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
11769ae82921SPaul Mullowney   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
11779ae82921SPaul Mullowney 
1178aa372e3fSPaul Mullowney   /* First, solve L */
1179aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1180aa372e3fSPaul Mullowney                         loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1181aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1182aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1183aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1184aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1185aa372e3fSPaul Mullowney                         bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);
1186aa372e3fSPaul Mullowney 
1187aa372e3fSPaul Mullowney   /* Next, solve U */
1188aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1189aa372e3fSPaul Mullowney                         upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1190aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1191aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1192aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1193aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1194aa372e3fSPaul Mullowney                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);
11959ae82921SPaul Mullowney 
11969ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
11979ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
11989ae82921SPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
1199aa372e3fSPaul Mullowney   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
12009ae82921SPaul Mullowney   PetscFunctionReturn(0);
12019ae82921SPaul Mullowney }
12029ae82921SPaul Mullowney 
12039ae82921SPaul Mullowney #undef __FUNCT__
1204e057df02SPaul Mullowney #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
12056fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
12069ae82921SPaul Mullowney {
12079ae82921SPaul Mullowney 
1208aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1209aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
12109ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
12119ae82921SPaul Mullowney   PetscInt                     m = A->rmap->n,*ii,*ridx;
12129ae82921SPaul Mullowney   PetscErrorCode               ierr;
1213aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1214b06137fdSPaul Mullowney   cudaError_t                  err;
12159ae82921SPaul Mullowney 
12169ae82921SPaul Mullowney   PetscFunctionBegin;
12179ae82921SPaul Mullowney   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
12189ae82921SPaul Mullowney     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1219aa372e3fSPaul Mullowney     if (matstruct) {
1220aa372e3fSPaul Mullowney       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized.");
12219ae82921SPaul Mullowney     }
12229ae82921SPaul Mullowney     try {
1223aa372e3fSPaul Mullowney       cusparsestruct->nonzerorow=0;
1224aa372e3fSPaul Mullowney       for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
12259ae82921SPaul Mullowney 
12269ae82921SPaul Mullowney       if (a->compressedrow.use) {
12279ae82921SPaul Mullowney         m    = a->compressedrow.nrows;
12289ae82921SPaul Mullowney         ii   = a->compressedrow.i;
12299ae82921SPaul Mullowney         ridx = a->compressedrow.rindex;
12309ae82921SPaul Mullowney       } else {
1231b06137fdSPaul Mullowney         /* Forcing compressed row on the GPU */
12329ae82921SPaul Mullowney         int k=0;
1233785e854fSJed Brown         ierr = PetscMalloc1((cusparsestruct->nonzerorow+1), &ii);CHKERRQ(ierr);
1234785e854fSJed Brown         ierr = PetscMalloc1((cusparsestruct->nonzerorow), &ridx);CHKERRQ(ierr);
12359ae82921SPaul Mullowney         ii[0]=0;
12369ae82921SPaul Mullowney         for (int j = 0; j<m; j++) {
12379ae82921SPaul Mullowney           if ((a->i[j+1]-a->i[j])>0) {
12389ae82921SPaul Mullowney             ii[k]  = a->i[j];
12399ae82921SPaul Mullowney             ridx[k]= j;
12409ae82921SPaul Mullowney             k++;
12419ae82921SPaul Mullowney           }
12429ae82921SPaul Mullowney         }
1243aa372e3fSPaul Mullowney         ii[cusparsestruct->nonzerorow] = a->nz;
1244aa372e3fSPaul Mullowney         m = cusparsestruct->nonzerorow;
12459ae82921SPaul Mullowney       }
12469ae82921SPaul Mullowney 
1247aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
1248aa372e3fSPaul Mullowney       matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1249aa372e3fSPaul Mullowney       stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
1250aa372e3fSPaul Mullowney       stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
1251aa372e3fSPaul Mullowney       stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);
12529ae82921SPaul Mullowney 
1253b06137fdSPaul Mullowney       err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
1254b06137fdSPaul Mullowney       err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1255b06137fdSPaul Mullowney       err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
1256b06137fdSPaul Mullowney       err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1257b06137fdSPaul Mullowney       stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
1258b06137fdSPaul Mullowney 
1259aa372e3fSPaul Mullowney       /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1260aa372e3fSPaul Mullowney       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1261aa372e3fSPaul Mullowney /* set the matrix */
1262aa372e3fSPaul Mullowney         CsrMatrix *matrix= new CsrMatrix;
1263a65300a6SPaul Mullowney         matrix->num_rows = m;
1264aa372e3fSPaul Mullowney         matrix->num_cols = A->cmap->n;
1265aa372e3fSPaul Mullowney         matrix->num_entries = a->nz;
1266a65300a6SPaul Mullowney         matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1267a65300a6SPaul Mullowney         matrix->row_offsets->assign(ii, ii + m+1);
12689ae82921SPaul Mullowney 
1269aa372e3fSPaul Mullowney         matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1270aa372e3fSPaul Mullowney         matrix->column_indices->assign(a->j, a->j+a->nz);
1271aa372e3fSPaul Mullowney 
1272aa372e3fSPaul Mullowney         matrix->values = new THRUSTARRAY(a->nz);
1273aa372e3fSPaul Mullowney         matrix->values->assign(a->a, a->a+a->nz);
1274aa372e3fSPaul Mullowney 
1275aa372e3fSPaul Mullowney /* assign the pointer */
1276aa372e3fSPaul Mullowney         matstruct->mat = matrix;
1277aa372e3fSPaul Mullowney 
1278aa372e3fSPaul Mullowney       } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
12792692e278SPaul Mullowney #if CUDA_VERSION>=4020
1280aa372e3fSPaul Mullowney         CsrMatrix *matrix= new CsrMatrix;
1281a65300a6SPaul Mullowney         matrix->num_rows = m;
1282aa372e3fSPaul Mullowney         matrix->num_cols = A->cmap->n;
1283aa372e3fSPaul Mullowney         matrix->num_entries = a->nz;
1284a65300a6SPaul Mullowney         matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1285a65300a6SPaul Mullowney         matrix->row_offsets->assign(ii, ii + m+1);
1286aa372e3fSPaul Mullowney 
1287aa372e3fSPaul Mullowney         matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1288aa372e3fSPaul Mullowney         matrix->column_indices->assign(a->j, a->j+a->nz);
1289aa372e3fSPaul Mullowney 
1290aa372e3fSPaul Mullowney         matrix->values = new THRUSTARRAY(a->nz);
1291aa372e3fSPaul Mullowney         matrix->values->assign(a->a, a->a+a->nz);
1292aa372e3fSPaul Mullowney 
1293aa372e3fSPaul Mullowney         cusparseHybMat_t hybMat;
1294aa372e3fSPaul Mullowney         stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
1295aa372e3fSPaul Mullowney         cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1296aa372e3fSPaul Mullowney           CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1297a65300a6SPaul Mullowney         stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1298aa372e3fSPaul Mullowney                                 matstruct->descr, matrix->values->data().get(),
1299aa372e3fSPaul Mullowney                                 matrix->row_offsets->data().get(),
1300aa372e3fSPaul Mullowney                                 matrix->column_indices->data().get(),
1301aa372e3fSPaul Mullowney                                 hybMat, 0, partition);CHKERRCUSP(stat);
1302aa372e3fSPaul Mullowney         /* assign the pointer */
1303aa372e3fSPaul Mullowney         matstruct->mat = hybMat;
1304aa372e3fSPaul Mullowney 
1305aa372e3fSPaul Mullowney         if (matrix) {
1306aa372e3fSPaul Mullowney           if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1307aa372e3fSPaul Mullowney           if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1308aa372e3fSPaul Mullowney           if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1309aa372e3fSPaul Mullowney           delete (CsrMatrix*)matrix;
1310087f3262SPaul Mullowney         }
13112692e278SPaul Mullowney #endif
1312087f3262SPaul Mullowney       }
1313ca45077fSPaul Mullowney 
1314aa372e3fSPaul Mullowney       /* assign the compressed row indices */
1315aa372e3fSPaul Mullowney       matstruct->cprowIndices = new THRUSTINTARRAY(m);
1316aa372e3fSPaul Mullowney       matstruct->cprowIndices->assign(ridx,ridx+m);
1317aa372e3fSPaul Mullowney 
1318aa372e3fSPaul Mullowney       /* assign the pointer */
1319aa372e3fSPaul Mullowney       cusparsestruct->mat = matstruct;
1320aa372e3fSPaul Mullowney 
13219ae82921SPaul Mullowney       if (!a->compressedrow.use) {
13229ae82921SPaul Mullowney         ierr = PetscFree(ii);CHKERRQ(ierr);
13239ae82921SPaul Mullowney         ierr = PetscFree(ridx);CHKERRQ(ierr);
13249ae82921SPaul Mullowney       }
1325aa372e3fSPaul Mullowney       cusparsestruct->workVector = new THRUSTARRAY;
1326aa372e3fSPaul Mullowney       cusparsestruct->workVector->resize(m);
13279ae82921SPaul Mullowney     } catch(char *ex) {
13289ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
13299ae82921SPaul Mullowney     }
13309ae82921SPaul Mullowney     ierr = WaitForGPU();CHKERRCUSP(ierr);
13312205254eSKarl Rupp 
13329ae82921SPaul Mullowney     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
13332205254eSKarl Rupp 
13349ae82921SPaul Mullowney     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
13359ae82921SPaul Mullowney   }
13369ae82921SPaul Mullowney   PetscFunctionReturn(0);
13379ae82921SPaul Mullowney }
13389ae82921SPaul Mullowney 
13399ae82921SPaul Mullowney #undef __FUNCT__
13402a7a6963SBarry Smith #define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
13412a7a6963SBarry Smith static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
13429ae82921SPaul Mullowney {
13439ae82921SPaul Mullowney   PetscErrorCode ierr;
134433d57670SJed Brown   PetscInt rbs,cbs;
13459ae82921SPaul Mullowney 
13469ae82921SPaul Mullowney   PetscFunctionBegin;
134733d57670SJed Brown   ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
13489ae82921SPaul Mullowney   if (right) {
1349ce94432eSBarry Smith     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
13509ae82921SPaul Mullowney     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
135133d57670SJed Brown     ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
13529ae82921SPaul Mullowney     ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
13539ae82921SPaul Mullowney     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
13549ae82921SPaul Mullowney   }
13559ae82921SPaul Mullowney   if (left) {
1356ce94432eSBarry Smith     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
13579ae82921SPaul Mullowney     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
135833d57670SJed Brown     ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
13599ae82921SPaul Mullowney     ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
13609ae82921SPaul Mullowney     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
13619ae82921SPaul Mullowney   }
13629ae82921SPaul Mullowney   PetscFunctionReturn(0);
13639ae82921SPaul Mullowney }
13649ae82921SPaul Mullowney 
1365aa372e3fSPaul Mullowney struct VecCUSPPlusEquals
1366aa372e3fSPaul Mullowney {
1367aa372e3fSPaul Mullowney   template <typename Tuple>
1368aa372e3fSPaul Mullowney   __host__ __device__
1369aa372e3fSPaul Mullowney   void operator()(Tuple t)
1370aa372e3fSPaul Mullowney   {
1371aa372e3fSPaul Mullowney     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1372aa372e3fSPaul Mullowney   }
1373aa372e3fSPaul Mullowney };
1374aa372e3fSPaul Mullowney 
13759ae82921SPaul Mullowney #undef __FUNCT__
13769ae82921SPaul Mullowney #define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
13776fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
13789ae82921SPaul Mullowney {
13799ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1380aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1381aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1382bda325fcSPaul Mullowney   CUSPARRAY                    *xarray,*yarray;
1383b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
1384aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
13859ae82921SPaul Mullowney 
13869ae82921SPaul Mullowney   PetscFunctionBegin;
1387e057df02SPaul Mullowney   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1388e057df02SPaul Mullowney      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
13899ae82921SPaul Mullowney   ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
13909ae82921SPaul Mullowney   ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
1391aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1392aa372e3fSPaul Mullowney     CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1393aa372e3fSPaul Mullowney     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1394aa372e3fSPaul Mullowney                              mat->num_rows, mat->num_cols, mat->num_entries,
1395b06137fdSPaul Mullowney                              matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
1396b06137fdSPaul Mullowney                              mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
1397aa372e3fSPaul Mullowney                              yarray->data().get());CHKERRCUSP(stat);
1398aa372e3fSPaul Mullowney   } else {
13992692e278SPaul Mullowney #if CUDA_VERSION>=4020
1400aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1401aa372e3fSPaul Mullowney     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1402b06137fdSPaul Mullowney                              matstruct->alpha, matstruct->descr, hybMat,
1403b06137fdSPaul Mullowney                              xarray->data().get(), matstruct->beta,
1404aa372e3fSPaul Mullowney                              yarray->data().get());CHKERRCUSP(stat);
14052692e278SPaul Mullowney #endif
14069ae82921SPaul Mullowney   }
14079ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
14089ae82921SPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
1409aa372e3fSPaul Mullowney   if (!cusparsestruct->stream) {
14109ae82921SPaul Mullowney     ierr = WaitForGPU();CHKERRCUSP(ierr);
1411ca45077fSPaul Mullowney   }
1412aa372e3fSPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
14139ae82921SPaul Mullowney   PetscFunctionReturn(0);
14149ae82921SPaul Mullowney }
14159ae82921SPaul Mullowney 
14169ae82921SPaul Mullowney #undef __FUNCT__
1417ca45077fSPaul Mullowney #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
14186fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1419ca45077fSPaul Mullowney {
1420ca45077fSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1421aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1422aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1423bda325fcSPaul Mullowney   CUSPARRAY                    *xarray,*yarray;
1424b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
1425aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1426ca45077fSPaul Mullowney 
1427ca45077fSPaul Mullowney   PetscFunctionBegin;
1428e057df02SPaul Mullowney   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1429e057df02SPaul Mullowney      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
1430aa372e3fSPaul Mullowney   if (!matstructT) {
1431bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1432aa372e3fSPaul Mullowney     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1433bda325fcSPaul Mullowney   }
1434ca45077fSPaul Mullowney   ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1435ca45077fSPaul Mullowney   ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
1436aa372e3fSPaul Mullowney 
1437aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1438aa372e3fSPaul Mullowney     CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1439aa372e3fSPaul Mullowney     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1440aa372e3fSPaul Mullowney                              mat->num_rows, mat->num_cols,
1441b06137fdSPaul Mullowney                              mat->num_entries, matstructT->alpha, matstructT->descr,
1442aa372e3fSPaul Mullowney                              mat->values->data().get(), mat->row_offsets->data().get(),
1443b06137fdSPaul Mullowney                              mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
1444aa372e3fSPaul Mullowney                              yarray->data().get());CHKERRCUSP(stat);
1445aa372e3fSPaul Mullowney   } else {
14462692e278SPaul Mullowney #if CUDA_VERSION>=4020
1447aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1448aa372e3fSPaul Mullowney     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1449b06137fdSPaul Mullowney                              matstructT->alpha, matstructT->descr, hybMat,
1450b06137fdSPaul Mullowney                              xarray->data().get(), matstructT->beta,
1451aa372e3fSPaul Mullowney                              yarray->data().get());CHKERRCUSP(stat);
14522692e278SPaul Mullowney #endif
1453ca45077fSPaul Mullowney   }
1454ca45077fSPaul Mullowney   ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1455ca45077fSPaul Mullowney   ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
1456aa372e3fSPaul Mullowney   if (!cusparsestruct->stream) {
1457ca45077fSPaul Mullowney     ierr = WaitForGPU();CHKERRCUSP(ierr);
1458ca45077fSPaul Mullowney   }
1459aa372e3fSPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
1460ca45077fSPaul Mullowney   PetscFunctionReturn(0);
1461ca45077fSPaul Mullowney }
1462ca45077fSPaul Mullowney 
1463aa372e3fSPaul Mullowney 
1464ca45077fSPaul Mullowney #undef __FUNCT__
14659ae82921SPaul Mullowney #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
14666fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
14679ae82921SPaul Mullowney {
14689ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1469aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1470aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1471bda325fcSPaul Mullowney   CUSPARRAY                    *xarray,*yarray,*zarray;
1472b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
1473aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
14746e111a19SKarl Rupp 
14759ae82921SPaul Mullowney   PetscFunctionBegin;
1476e057df02SPaul Mullowney   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1477e057df02SPaul Mullowney      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
14789ae82921SPaul Mullowney   try {
14799ae82921SPaul Mullowney     ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
14809ae82921SPaul Mullowney     ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
14819ae82921SPaul Mullowney     ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
14829ae82921SPaul Mullowney     ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
14839ae82921SPaul Mullowney 
1484e057df02SPaul Mullowney     /* multiply add */
1485aa372e3fSPaul Mullowney     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1486aa372e3fSPaul Mullowney       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1487b06137fdSPaul Mullowney     /* here we need to be careful to set the number of rows in the multiply to the
1488b06137fdSPaul Mullowney        number of compressed rows in the matrix ... which is equivalent to the
1489b06137fdSPaul Mullowney        size of the workVector */
1490aa372e3fSPaul Mullowney       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1491a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
1492b06137fdSPaul Mullowney                                mat->num_entries, matstruct->alpha, matstruct->descr,
1493aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
1494b06137fdSPaul Mullowney                                mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
1495aa372e3fSPaul Mullowney                                cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1496aa372e3fSPaul Mullowney     } else {
14972692e278SPaul Mullowney #if CUDA_VERSION>=4020
1498aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1499a65300a6SPaul Mullowney       if (cusparsestruct->workVector->size()) {
1500aa372e3fSPaul Mullowney         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1501b06137fdSPaul Mullowney             matstruct->alpha, matstruct->descr, hybMat,
1502b06137fdSPaul Mullowney             xarray->data().get(), matstruct->beta,
1503aa372e3fSPaul Mullowney             cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1504a65300a6SPaul Mullowney       }
15052692e278SPaul Mullowney #endif
1506aa372e3fSPaul Mullowney     }
1507aa372e3fSPaul Mullowney 
1508aa372e3fSPaul Mullowney     /* scatter the data from the temporary into the full vector with a += operation */
1509aa372e3fSPaul Mullowney     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
1510aa372e3fSPaul Mullowney         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1511aa372e3fSPaul Mullowney         VecCUSPPlusEquals());
15129ae82921SPaul Mullowney     ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
15139ae82921SPaul Mullowney     ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
15149ae82921SPaul Mullowney     ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
15159ae82921SPaul Mullowney 
15169ae82921SPaul Mullowney   } catch(char *ex) {
15179ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
15189ae82921SPaul Mullowney   }
15199ae82921SPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
15209ae82921SPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
15219ae82921SPaul Mullowney   PetscFunctionReturn(0);
15229ae82921SPaul Mullowney }
15239ae82921SPaul Mullowney 
15249ae82921SPaul Mullowney #undef __FUNCT__
1525b175d8bbSPaul Mullowney #define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
15266fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1527ca45077fSPaul Mullowney {
1528ca45077fSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1529aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1530aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1531ca45077fSPaul Mullowney   CUSPARRAY                    *xarray,*yarray,*zarray;
1532b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
1533aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
15346e111a19SKarl Rupp 
1535ca45077fSPaul Mullowney   PetscFunctionBegin;
1536e057df02SPaul Mullowney   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1537e057df02SPaul Mullowney      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
1538aa372e3fSPaul Mullowney   if (!matstructT) {
1539bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1540aa372e3fSPaul Mullowney     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1541bda325fcSPaul Mullowney   }
1542aa372e3fSPaul Mullowney 
1543ca45077fSPaul Mullowney   try {
1544ca45077fSPaul Mullowney     ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
1545ca45077fSPaul Mullowney     ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1546ca45077fSPaul Mullowney     ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
1547ca45077fSPaul Mullowney     ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1548ca45077fSPaul Mullowney 
1549e057df02SPaul Mullowney     /* multiply add with matrix transpose */
1550aa372e3fSPaul Mullowney     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1551aa372e3fSPaul Mullowney       CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1552b06137fdSPaul Mullowney       /* here we need to be careful to set the number of rows in the multiply to the
1553b06137fdSPaul Mullowney          number of compressed rows in the matrix ... which is equivalent to the
1554b06137fdSPaul Mullowney          size of the workVector */
1555aa372e3fSPaul Mullowney       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1556a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
1557b06137fdSPaul Mullowney                                mat->num_entries, matstructT->alpha, matstructT->descr,
1558aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
1559b06137fdSPaul Mullowney                                mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
1560aa372e3fSPaul Mullowney                                cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1561aa372e3fSPaul Mullowney     } else {
15622692e278SPaul Mullowney #if CUDA_VERSION>=4020
1563aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1564a65300a6SPaul Mullowney       if (cusparsestruct->workVector->size()) {
1565aa372e3fSPaul Mullowney         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1566b06137fdSPaul Mullowney             matstructT->alpha, matstructT->descr, hybMat,
1567b06137fdSPaul Mullowney             xarray->data().get(), matstructT->beta,
1568aa372e3fSPaul Mullowney             cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1569a65300a6SPaul Mullowney       }
15702692e278SPaul Mullowney #endif
1571aa372e3fSPaul Mullowney     }
1572aa372e3fSPaul Mullowney 
1573aa372e3fSPaul Mullowney     /* scatter the data from the temporary into the full vector with a += operation */
1574aa372e3fSPaul Mullowney     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))),
1575aa372e3fSPaul Mullowney         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1576aa372e3fSPaul Mullowney         VecCUSPPlusEquals());
1577ca45077fSPaul Mullowney 
1578ca45077fSPaul Mullowney     ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1579ca45077fSPaul Mullowney     ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
1580ca45077fSPaul Mullowney     ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1581ca45077fSPaul Mullowney 
1582ca45077fSPaul Mullowney   } catch(char *ex) {
1583ca45077fSPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1584ca45077fSPaul Mullowney   }
1585ca45077fSPaul Mullowney   ierr = WaitForGPU();CHKERRCUSP(ierr);
1586ca45077fSPaul Mullowney   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1587ca45077fSPaul Mullowney   PetscFunctionReturn(0);
1588ca45077fSPaul Mullowney }
1589ca45077fSPaul Mullowney 
1590ca45077fSPaul Mullowney #undef __FUNCT__
15919ae82921SPaul Mullowney #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
15926fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
15939ae82921SPaul Mullowney {
15949ae82921SPaul Mullowney   PetscErrorCode ierr;
15956e111a19SKarl Rupp 
15969ae82921SPaul Mullowney   PetscFunctionBegin;
15979ae82921SPaul Mullowney   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
1598bc3f50f2SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
1599e057df02SPaul Mullowney     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1600bc3f50f2SPaul Mullowney   }
16019ae82921SPaul Mullowney   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1602bbf3fe20SPaul Mullowney   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1603bbf3fe20SPaul Mullowney   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1604bbf3fe20SPaul Mullowney   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1605bbf3fe20SPaul Mullowney   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
16069ae82921SPaul Mullowney   PetscFunctionReturn(0);
16079ae82921SPaul Mullowney }
16089ae82921SPaul Mullowney 
16099ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
16109ae82921SPaul Mullowney #undef __FUNCT__
16119ae82921SPaul Mullowney #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
1612e057df02SPaul Mullowney /*@
16139ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1614e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
1615e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1616e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
1617e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
1618e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
16199ae82921SPaul Mullowney 
16209ae82921SPaul Mullowney    Collective on MPI_Comm
16219ae82921SPaul Mullowney 
16229ae82921SPaul Mullowney    Input Parameters:
16239ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
16249ae82921SPaul Mullowney .  m - number of rows
16259ae82921SPaul Mullowney .  n - number of columns
16269ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
16279ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
16280298fd71SBarry Smith          (possibly different for each row) or NULL
16299ae82921SPaul Mullowney 
16309ae82921SPaul Mullowney    Output Parameter:
16319ae82921SPaul Mullowney .  A - the matrix
16329ae82921SPaul Mullowney 
16339ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
16349ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
16359ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
16369ae82921SPaul Mullowney 
16379ae82921SPaul Mullowney    Notes:
16389ae82921SPaul Mullowney    If nnz is given then nz is ignored
16399ae82921SPaul Mullowney 
16409ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
16419ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
16429ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
16439ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
16449ae82921SPaul Mullowney 
16459ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
16460298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
16479ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
16489ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
16499ae82921SPaul Mullowney 
16509ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
16519ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
16529ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
16539ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
16549ae82921SPaul Mullowney 
16559ae82921SPaul Mullowney    Level: intermediate
16569ae82921SPaul Mullowney 
1657e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
16589ae82921SPaul Mullowney @*/
16599ae82921SPaul Mullowney PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
16609ae82921SPaul Mullowney {
16619ae82921SPaul Mullowney   PetscErrorCode ierr;
16629ae82921SPaul Mullowney 
16639ae82921SPaul Mullowney   PetscFunctionBegin;
16649ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
16659ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
16669ae82921SPaul Mullowney   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
16679ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
16689ae82921SPaul Mullowney   PetscFunctionReturn(0);
16699ae82921SPaul Mullowney }
16709ae82921SPaul Mullowney 
16719ae82921SPaul Mullowney #undef __FUNCT__
16729ae82921SPaul Mullowney #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
16736fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
16749ae82921SPaul Mullowney {
16759ae82921SPaul Mullowney   PetscErrorCode   ierr;
1676aa372e3fSPaul Mullowney   cusparseStatus_t stat;
1677b06137fdSPaul Mullowney   cudaError_t      err;
16789ae82921SPaul Mullowney   PetscFunctionBegin;
16799ae82921SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
16809ae82921SPaul Mullowney     try {
16819ae82921SPaul Mullowney       if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
1682aa372e3fSPaul Mullowney         Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1683aa372e3fSPaul Mullowney         Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1684aa372e3fSPaul Mullowney         Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1685aa372e3fSPaul Mullowney 
1686aa372e3fSPaul Mullowney         /* delete all the members in each of the data structures */
1687aa372e3fSPaul Mullowney         if (matstruct) {
1688aa372e3fSPaul Mullowney           if (matstruct->descr) { stat = cusparseDestroyMatDescr(matstruct->descr);CHKERRCUSP(stat); }
1689aa372e3fSPaul Mullowney           if (matstruct->mat) {
1690aa372e3fSPaul Mullowney             if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
16912692e278SPaul Mullowney #if CUDA_VERSION>=4020
1692aa372e3fSPaul Mullowney               cusparseHybMat_t hybMat = (cusparseHybMat_t) matstruct->mat;
1693aa372e3fSPaul Mullowney               stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
16942692e278SPaul Mullowney #endif
1695aa372e3fSPaul Mullowney             } else {
1696aa372e3fSPaul Mullowney               CsrMatrix* mat = (CsrMatrix*)matstruct->mat;
1697aa372e3fSPaul Mullowney               if (mat->values) delete (THRUSTARRAY*)mat->values;
1698aa372e3fSPaul Mullowney               if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1699aa372e3fSPaul Mullowney               if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1700aa372e3fSPaul Mullowney               delete (CsrMatrix*)matstruct->mat;
17019ae82921SPaul Mullowney             }
1702aa372e3fSPaul Mullowney           }
1703b06137fdSPaul Mullowney           if (matstruct->alpha) { err=cudaFree(matstruct->alpha);CHKERRCUSP(err); }
1704b06137fdSPaul Mullowney           if (matstruct->beta) { err=cudaFree(matstruct->beta);CHKERRCUSP(err); }
1705aa372e3fSPaul Mullowney           if (matstruct->cprowIndices) delete (THRUSTINTARRAY*)matstruct->cprowIndices;
1706aa372e3fSPaul Mullowney           if (matstruct) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstruct;
1707aa372e3fSPaul Mullowney         }
1708aa372e3fSPaul Mullowney         if (matstructT) {
1709aa372e3fSPaul Mullowney           if (matstructT->descr) { stat = cusparseDestroyMatDescr(matstructT->descr);CHKERRCUSP(stat); }
1710aa372e3fSPaul Mullowney           if (matstructT->mat) {
1711aa372e3fSPaul Mullowney             if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
17122692e278SPaul Mullowney #if CUDA_VERSION>=4020
1713aa372e3fSPaul Mullowney               cusparseHybMat_t hybMat = (cusparseHybMat_t) matstructT->mat;
1714aa372e3fSPaul Mullowney               stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
17152692e278SPaul Mullowney #endif
1716aa372e3fSPaul Mullowney             } else {
1717aa372e3fSPaul Mullowney               CsrMatrix* matT = (CsrMatrix*)matstructT->mat;
1718aa372e3fSPaul Mullowney               if (matT->values) delete (THRUSTARRAY*)matT->values;
1719aa372e3fSPaul Mullowney               if (matT->column_indices) delete (THRUSTINTARRAY32*)matT->column_indices;
1720aa372e3fSPaul Mullowney               if (matT->row_offsets) delete (THRUSTINTARRAY32*)matT->row_offsets;
1721aa372e3fSPaul Mullowney               delete (CsrMatrix*)matstructT->mat;
1722aa372e3fSPaul Mullowney             }
1723aa372e3fSPaul Mullowney           }
1724b06137fdSPaul Mullowney           if (matstructT->alpha) { err=cudaFree(matstructT->alpha);CHKERRCUSP(err); }
1725b06137fdSPaul Mullowney           if (matstructT->beta) { err=cudaFree(matstructT->beta);CHKERRCUSP(err); }
1726aa372e3fSPaul Mullowney           if (matstructT->cprowIndices) delete (THRUSTINTARRAY*)matstructT->cprowIndices;
1727aa372e3fSPaul Mullowney           if (matstructT) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstructT;
1728aa372e3fSPaul Mullowney         }
1729aa372e3fSPaul Mullowney         if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector;
1730aa372e3fSPaul Mullowney         if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
1731aa372e3fSPaul Mullowney         if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
17329ae82921SPaul Mullowney         A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
1733aa372e3fSPaul Mullowney       } else {
1734aa372e3fSPaul Mullowney         Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1735aa372e3fSPaul Mullowney         if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
1736aa372e3fSPaul Mullowney         if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
1737aa372e3fSPaul Mullowney       }
17389ae82921SPaul Mullowney     } catch(char *ex) {
17399ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
17409ae82921SPaul Mullowney     }
17419ae82921SPaul Mullowney   } else {
1742e057df02SPaul Mullowney     /* The triangular factors */
17439ae82921SPaul Mullowney     try {
17449ae82921SPaul Mullowney       Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1745aa372e3fSPaul Mullowney       Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1746aa372e3fSPaul Mullowney       Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1747aa372e3fSPaul Mullowney       Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1748aa372e3fSPaul Mullowney       Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1749aa372e3fSPaul Mullowney 
1750aa372e3fSPaul Mullowney       /* delete all the members in each of the data structures */
1751aa372e3fSPaul Mullowney       if (loTriFactor) {
1752aa372e3fSPaul Mullowney         if (loTriFactor->descr) { stat = cusparseDestroyMatDescr(loTriFactor->descr);CHKERRCUSP(stat); }
1753aa372e3fSPaul Mullowney         if (loTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactor->solveInfo);CHKERRCUSP(stat); }
1754aa372e3fSPaul Mullowney         if (loTriFactor->csrMat) {
1755aa372e3fSPaul Mullowney           if (loTriFactor->csrMat->values) delete (THRUSTARRAY*)loTriFactor->csrMat->values;
1756aa372e3fSPaul Mullowney           if (loTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->column_indices;
1757aa372e3fSPaul Mullowney           if (loTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->row_offsets;
1758aa372e3fSPaul Mullowney           delete (CsrMatrix*)loTriFactor->csrMat;
1759aa372e3fSPaul Mullowney         }
1760aa372e3fSPaul Mullowney         if (loTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactor;
1761aa372e3fSPaul Mullowney       }
1762aa372e3fSPaul Mullowney 
1763aa372e3fSPaul Mullowney       if (upTriFactor) {
1764aa372e3fSPaul Mullowney         if (upTriFactor->descr) { stat = cusparseDestroyMatDescr(upTriFactor->descr);CHKERRCUSP(stat); }
1765aa372e3fSPaul Mullowney         if (upTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactor->solveInfo);CHKERRCUSP(stat); }
1766aa372e3fSPaul Mullowney         if (upTriFactor->csrMat) {
1767aa372e3fSPaul Mullowney           if (upTriFactor->csrMat->values) delete (THRUSTARRAY*)upTriFactor->csrMat->values;
1768aa372e3fSPaul Mullowney           if (upTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->column_indices;
1769aa372e3fSPaul Mullowney           if (upTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->row_offsets;
1770aa372e3fSPaul Mullowney           delete (CsrMatrix*)upTriFactor->csrMat;
1771aa372e3fSPaul Mullowney         }
1772aa372e3fSPaul Mullowney         if (upTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactor;
1773aa372e3fSPaul Mullowney       }
1774aa372e3fSPaul Mullowney 
1775aa372e3fSPaul Mullowney       if (loTriFactorT) {
1776aa372e3fSPaul Mullowney         if (loTriFactorT->descr) { stat = cusparseDestroyMatDescr(loTriFactorT->descr);CHKERRCUSP(stat); }
1777aa372e3fSPaul Mullowney         if (loTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactorT->solveInfo);CHKERRCUSP(stat); }
1778aa372e3fSPaul Mullowney         if (loTriFactorT->csrMat) {
1779aa372e3fSPaul Mullowney           if (loTriFactorT->csrMat->values) delete (THRUSTARRAY*)loTriFactorT->csrMat->values;
1780aa372e3fSPaul Mullowney           if (loTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->column_indices;
1781aa372e3fSPaul Mullowney           if (loTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->row_offsets;
1782aa372e3fSPaul Mullowney           delete (CsrMatrix*)loTriFactorT->csrMat;
1783aa372e3fSPaul Mullowney         }
1784aa372e3fSPaul Mullowney         if (loTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactorT;
1785aa372e3fSPaul Mullowney       }
1786aa372e3fSPaul Mullowney 
1787aa372e3fSPaul Mullowney       if (upTriFactorT) {
1788aa372e3fSPaul Mullowney         if (upTriFactorT->descr) { stat = cusparseDestroyMatDescr(upTriFactorT->descr);CHKERRCUSP(stat); }
1789aa372e3fSPaul Mullowney         if (upTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactorT->solveInfo);CHKERRCUSP(stat); }
1790aa372e3fSPaul Mullowney         if (upTriFactorT->csrMat) {
1791aa372e3fSPaul Mullowney           if (upTriFactorT->csrMat->values) delete (THRUSTARRAY*)upTriFactorT->csrMat->values;
1792aa372e3fSPaul Mullowney           if (upTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->column_indices;
1793aa372e3fSPaul Mullowney           if (upTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->row_offsets;
1794aa372e3fSPaul Mullowney           delete (CsrMatrix*)upTriFactorT->csrMat;
1795aa372e3fSPaul Mullowney         }
1796aa372e3fSPaul Mullowney         if (upTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactorT;
1797aa372e3fSPaul Mullowney       }
1798aa372e3fSPaul Mullowney       if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices;
1799aa372e3fSPaul Mullowney       if (cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices;
1800aa372e3fSPaul Mullowney       if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector;
1801aa372e3fSPaul Mullowney       if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); }
1802aa372e3fSPaul Mullowney       if (cusparseTriFactors) delete (Mat_SeqAIJCUSPARSETriFactors*)cusparseTriFactors;
18039ae82921SPaul Mullowney     } catch(char *ex) {
18049ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
18059ae82921SPaul Mullowney     }
18069ae82921SPaul Mullowney   }
18079ae82921SPaul Mullowney   /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
18089ae82921SPaul Mullowney   A->spptr = 0;
18099ae82921SPaul Mullowney 
18109ae82921SPaul Mullowney   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
18119ae82921SPaul Mullowney   PetscFunctionReturn(0);
18129ae82921SPaul Mullowney }
18139ae82921SPaul Mullowney 
18149ae82921SPaul Mullowney #undef __FUNCT__
18159ae82921SPaul Mullowney #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
18168cc058d9SJed Brown PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
18179ae82921SPaul Mullowney {
18189ae82921SPaul Mullowney   PetscErrorCode ierr;
1819aa372e3fSPaul Mullowney   cusparseStatus_t stat;
1820aa372e3fSPaul Mullowney   cusparseHandle_t handle=0;
18219ae82921SPaul Mullowney 
18229ae82921SPaul Mullowney   PetscFunctionBegin;
18239ae82921SPaul Mullowney   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
18249ae82921SPaul Mullowney   if (B->factortype==MAT_FACTOR_NONE) {
1825e057df02SPaul Mullowney     /* you cannot check the inode.use flag here since the matrix was just created.
1826e057df02SPaul Mullowney        now build a GPU matrix data structure */
18279ae82921SPaul Mullowney     B->spptr = new Mat_SeqAIJCUSPARSE;
18289ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
1829aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1830aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
1831e057df02SPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
1832aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1833aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = 0;
1834aa372e3fSPaul Mullowney     stat = cusparseCreate(&handle);CHKERRCUSP(stat);
1835aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
1836aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
18379ae82921SPaul Mullowney   } else {
18389ae82921SPaul Mullowney     /* NEXT, set the pointers to the triangular factors */
1839debe9ee2SPaul Mullowney     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
18409ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
18419ae82921SPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
1842aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1843aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1844aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
1845aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
1846aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
1847aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = 0;
1848aa372e3fSPaul Mullowney     stat = cusparseCreate(&handle);CHKERRCUSP(stat);
1849aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
1850aa372e3fSPaul Mullowney     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
18519ae82921SPaul Mullowney   }
1852aa372e3fSPaul Mullowney 
1853e057df02SPaul Mullowney   /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
1854e057df02SPaul Mullowney      default cusparse tri solve. Note the difference with the implementation in
1855e057df02SPaul Mullowney      MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
1856bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
18572205254eSKarl Rupp 
18589ae82921SPaul Mullowney   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
18599ae82921SPaul Mullowney   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
18602a7a6963SBarry Smith   B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
18619ae82921SPaul Mullowney   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1862ca45077fSPaul Mullowney   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
1863ca45077fSPaul Mullowney   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1864ca45077fSPaul Mullowney   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1865ca45077fSPaul Mullowney   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
18662205254eSKarl Rupp 
18679ae82921SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
18682205254eSKarl Rupp 
18699ae82921SPaul Mullowney   B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
18702205254eSKarl Rupp 
1871bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
18729ae82921SPaul Mullowney   PetscFunctionReturn(0);
18739ae82921SPaul Mullowney }
18749ae82921SPaul Mullowney 
1875e057df02SPaul Mullowney /*M
1876e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1877e057df02SPaul Mullowney 
1878e057df02SPaul Mullowney    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
18792692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
18802692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1881e057df02SPaul Mullowney 
1882e057df02SPaul Mullowney    Options Database Keys:
1883e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1884aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1885aa372e3fSPaul Mullowney .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1886e057df02SPaul Mullowney 
1887e057df02SPaul Mullowney   Level: beginner
1888e057df02SPaul Mullowney 
18898468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1890e057df02SPaul Mullowney M*/
1891