xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision dcf908cab9b0ea5e30ac5791f6aa2fcfe1440bec)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
/* Printable names for MatCUSPARSEStorageFormat, consumed by PetscOptionsEnum();
   the trailing "MatCUSPARSEStorageFormat" and "MAT_CUSPARSE_" entries are the
   enum type name and value prefix required by PETSc's enum-option machinery,
   followed by a null terminator. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
/* Forward declarations: factorization entry points installed on factor matrices. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Triangular solves (with and without row/column permutations) and SpMV kernels. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Destructors for the GPU-side data structures.  Note the two
   MatSeqAIJCUSPARSEMultStruct_Destroy declarations are C++ overloads
   (this is a .cu file compiled as C++). */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
/*
   MatCUSPARSESetStream - records a CUDA stream in the matrix' GPU data
   structure and directs the matrix' cuSPARSE handle to issue its work on
   that stream.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE
-  stream - the CUDA stream to use for subsequent cuSPARSE calls on this matrix

   NOTE(review): assumes A->spptr and its cuSPARSE handle are already set up
   (no NULL checks) -- confirm callers guarantee this.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
51 
/*
   MatCUSPARSESetHandle - installs an externally created cuSPARSE handle on the
   matrix.  Any handle the matrix previously owned is destroyed first, and the
   (new) handle is switched to device pointer mode.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE
-  handle - the cuSPARSE handle the matrix should use from now on

   NOTE(review): after this call the matrix holds a handle it did not create;
   confirm the destroy path does not also destroy the caller's handle.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the handle the matrix currently owns before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  /* scalars (alpha/beta) will be passed to cuSPARSE as device pointers */
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
67 
/*
   MatCUSPARSEClearHandle - forgets (without destroying) the cuSPARSE handle
   stored on the matrix, so that a handle owned by someone else is not freed
   when the matrix is destroyed.

   Input Parameter:
.  A - matrix of type SEQAIJCUSPARSE

   The original dereferenced A->spptr unconditionally; if the GPU structure was
   never allocated (or already torn down) that is a NULL-pointer crash, so we
   return early in that case.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0); /* nothing to clear */
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
76 
/* Query callback composed on factor matrices: reports that this factor was
   produced by the MATSOLVERCUSPARSE solver package. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
83 
84 /*MC
85   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
86   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
87   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
88   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
89   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
90   algorithms are not recommended. This class does NOT support direct solver operations.
91 
92   Level: beginner
93 
94 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
95 M*/
96 
/*
   MatGetFactor_seqaijcusparse_cusparse - creates the (empty) factor matrix for
   the CUSPARSE solver package and installs the symbolic-factorization entry
   points appropriate for the requested factor type; the numeric routines are
   installed later by the symbolic phase.

   Input Parameters:
+  A     - the matrix to be factored (assumed square: both dimensions of the
           factor are taken from A->rmap->n)
-  ftype - MAT_FACTOR_LU/ILU/ILUDT or MAT_FACTOR_CHOLESKY/ICC

   Output Parameter:
.  B - the factor matrix shell (no values allocated; MAT_SKIP_ALLOCATION)
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* no CPU value storage: the factor's data lives in the structures built by the symbolic phase */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
121 
/* Type-specific implementation behind MatCUSPARSESetFormat(): stores the
   requested storage format on the matrix' GPU structure.  For sequential
   matrices MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both land in the same slot,
   so the two cases share one assignment. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both operations use the same format field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* pre-4.2 CUDA offers only CSR; reject the formats it cannot provide */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
143 
144 /*@
145    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
146    operation. Only the MatMult operation can use different GPU storage formats
147    for MPIAIJCUSPARSE matrices.
148    Not Collective
149 
150    Input Parameters:
151 +  A - Matrix of type SEQAIJCUSPARSE
152 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
153 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
154 
155    Output Parameter:
156 
157    Level: intermediate
158 
159 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
160 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: silently does nothing for matrix types that have not
     composed "MatCUSPARSESetFormat_C" (e.g. plain AIJ), so this is safe to
     call on any Mat */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
170 
/* Processes the -mat_cusparse_mult_storage_format (SpMV only, non-factored
   matrices) and -mat_cusparse_storage_format (SpMV and triangular solve)
   runtime options, forwarding any user selection to MatCUSPARSESetFormat(). */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  /* NOTE(review): for factored matrices A->spptr holds a Mat_SeqAIJCUSPARSETriFactors,
     not a Mat_SeqAIJCUSPARSE, so the cusparsestruct->format read below goes through
     the wrong type in that case -- confirm this is benign (the value is only used as
     the displayed default). */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
196 
/* Symbolic ILU: delegate to the CPU AIJ implementation, then install the
   CUSPARSE numeric routine so the numeric phase sets up the GPU solves. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
206 
/* Symbolic LU: delegate to the CPU AIJ implementation, then install the
   CUSPARSE numeric routine so the numeric phase sets up the GPU solves. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
216 
/* Symbolic ICC: delegate to the CPU AIJ implementation, then install the
   CUSPARSE Cholesky numeric routine for the GPU setup. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
226 
/* Symbolic Cholesky: delegate to the CPU AIJ implementation, then install the
   CUSPARSE Cholesky numeric routine for the GPU setup. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
236 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - builds the unit lower triangular
   factor L on the GPU from the host AIJ factor data and runs the cuSPARSE
   triangular-solve analysis on it.

   The host factor stores L with an implicit unit diagonal; cuSPARSE needs a
   complete CSR matrix, so the diagonal 1's are inserted explicitly here.
   Staging arrays are pinned host memory (cudaMallocHost) and are freed once
   the thrust device vectors have been filled.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* only (re)build when the GPU copy is missing or stale */
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): reads ai[1], so n >= 1 is assumed -- confirm empty matrices cannot reach here */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix: row 0 is just the unit diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based CSR, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the pinned host CSR arrays into device (thrust) vectors */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      /* NOTE(review): thrust/cuSPARSE wrappers typically throw std::exception-derived
         types, not char* -- confirm this handler can actually catch the errors it targets */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
338 
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - builds the upper triangular factor
   U on the GPU from the host AIJ factor data and runs the cuSPARSE
   triangular-solve analysis on it.

   The host factor stores U's rows backwards via a->diag (adiag) with inverted
   diagonal entries held at the row's diagonal position; the loop below walks
   rows from last to first, placing 1/diag first in each CSR row followed by
   the strictly-upper entries.  Staging arrays are pinned host memory.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* only (re)build when the GPU copy is missing or stale */
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows from the bottom up */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        /* the strictly-upper part of row i starts one past adiag[i+1] in the backwards layout */
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements (host stores the inverted pivot; invert back) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based CSR, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the pinned host CSR arrays into device (thrust) vectors */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
    } catch(char *ex) {
      /* NOTE(review): thrust/cuSPARSE wrappers typically throw std::exception-derived
         types, not char* -- confirm this handler can actually catch the errors it targets */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
436 
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - uploads both triangular factors
   of an ILU/LU factorization to the GPU, allocates the solve work vector, and
   caches the row/column permutations (only when they are not the identity,
   since identity permutations need no GPU-side reordering).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* build L and U on the device (each is a no-op if already current) */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used between the lower and upper solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
474 
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - builds both triangular factors of an
   ICC/Cholesky factorization on the GPU from a single upper-triangular host
   factor.  The upper factor U is stored directly; the lower solve reuses U's
   sparsity pattern with rescaled values and is performed as a TRANSPOSE solve
   on an upper-triangular matrix (hence FILL_MODE_UPPER on loTriFactor).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is cast to both Mat_SeqAIJ (above) and Mat_SeqSBAIJ here;
     only the i/j/a fields are read through b -- confirm their layouts coincide for
     the ICC factor storage */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  /* only (re)build when the GPU copy is missing or stale */
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements (host stores the pivot; both factors get 1/pivot) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* off-diagonals are negated for U, and additionally scaled by 1/pivot for L */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based CSR, triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the pinned host CSR arrays into device (thrust) vectors */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: the "lower" solve is done as the transpose
         of an upper-triangular matrix, hence FILL_MODE_UPPER here */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation: transpose solve of the upper pattern realizes the lower solve */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same sparsity pattern as U, rescaled values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      /* release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      /* NOTE(review): thrust/cuSPARSE wrappers typically throw std::exception-derived
         types, not char* -- confirm this handler can actually catch the errors it targets */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
619 
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - uploads the ICC/Cholesky factors
   to the GPU, allocates the solve work vector, and caches the (single,
   symmetric) permutation for both row and column reordering when it is not
   the identity.  nnz counts both triangles: off-diagonal entries twice plus
   the diagonal once.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch vector used between the lower and upper solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices: the same permutation serves rows and columns */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
647 
/* Numeric LU factorization: the factorization itself runs on the CPU; this
   routine then selects the GPU solve kernels (the permutation-free variants
   when both orderings are the identity) and pushes the factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      rows_natural,cols_natural;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(fact->row,&rows_natural);CHKERRQ(ierr);
  ierr = ISIdentity(fact->col,&cols_natural);CHKERRQ(ierr);
  if (rows_natural && cols_natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* upload the triangular factors and run the cuSPARSE solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
672 
/*
   Numeric Cholesky/ICC factorization: delegates the factorization itself to
   the CPU AIJ kernel, then selects the appropriate CUSPARSE solve variants
   (the permutation-free kernels when the ordering is the identity) and
   copies the triangular factors to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* natural ordering enables the permutation-free solves */
  ierr = ISIdentity(b->row,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* stage the triangular factors on the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
697 
/*
   Builds, on the GPU, the transposes of the cached lower and upper
   triangular factors (via csr2csc, so the result is the CSC form of each
   factor) together with their cusparse solve-analysis data, so that
   MatSolveTranspose can run entirely on the device.  The results are cached
   in loTriFactorPtrTranspose/upTriFactorPtrTranspose; the callers in this
   file invoke this only when those pointers are still NULL.

   Fix over the previous revision: loTriFactorT/upTriFactorT were
   initialized from A->spptr and then immediately overwritten with `new`,
   dead stores that read as if an existing transpose might be reused; they
   are now plain uninitialized locals, and the cached pointers are assigned
   through the already-cast cusparseTriFactors instead of re-casting
   A->spptr.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; /* allocated below */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; /* allocated below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor; the fill
     mode flips because the transpose of a lower factor is upper (and vice
     versa), while type, index base and diagonal type carry over */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: the transpose is stored explicitly, so the solve
     itself is non-transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* cache the result so subsequent transpose solves reuse it */
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor, flipping the
     fill mode as above */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* cache the result so subsequent transpose solves reuse it */
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
828 
/*
   Builds the transpose of the matrix on the GPU (in the CSR or HYB/ELL
   storage chosen for this matrix) for use by MatMultTranspose, and caches it
   in cusparsestruct->matTranspose.

   Fixes over the previous revision:
   - the transpose has A->cmap->n rows, so its CSR row-offset array needs
     A->cmap->n+1 entries; it was allocated with A->rmap->n+1 entries, which
     cusparse_csr2csc (writing num_cols+1 offsets of the input) overflows for
     rectangular matrices with more columns than rows.  The same applied to
     tempT in the HYB path, whose row/column dimensions (and the dimensions
     passed to csr2hyb) were also not swapped.
   - matstructT was initialized from cusparsestruct->matTranspose only to be
     immediately overwritten (dead store), and indexBase was fetched twice.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT; /* allocated below */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the transpose information, mirroring the descriptor
     of the un-transposed matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta (device-resident scalars for the SpMV calls) */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    /* the transpose has num_rows = A->cmap->n, so one offset per row plus one */
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose, i.e. the CSC of the original matrix */
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose); its dimensions
       are those of the original matrix, swapped */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->cmap->n;
    tempT->num_cols = A->rmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB, again using the transposed dimensions */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices: the transpose has A->cmap->n rows,
     all treated as nonzero (identity sequence 0..A->cmap->n-1) */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());

  /* cache the result so subsequent transpose multiplies reuse it */
  cusparsestruct->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
957 
/*
   MatSolveTranspose_SeqAIJCUSPARSE - GPU solve of the transposed factored
   system, xx = A^{-T} bb, for factorizations with non-trivial orderings.
   The transposed triangular factors (and their cusparse solve-analysis
   data) are built lazily on the first call and then reused.

   NOTE(review): this routine applies rpermIndices to the input and
   cpermIndices to the output, the same roles they have in the forward
   MatSolve_SeqAIJCUSPARSE; for a transposed system one would expect the two
   permutations to exchange roles.  This is harmless when the factorization
   is symmetric (MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU fills rperm and
   cperm from the same index set) — confirm for the unsymmetric LU path.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Build the transposed factors and their solve analysis on the fly, once;
     both pointers are produced together by the analysis routine */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, gather b through the row permutation into x (see NOTE above) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve with the explicitly transposed upper factor, x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve with the explicitly transposed lower factor, tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, gather the solution through the column permutation into the work
     vector; the permutation can't be applied in place */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays and wait for the asynchronous GPU work */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);

  /* two triangular solves over the factored nonzeros */
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1025 
/*
   GPU solve of the transposed factored system with natural ordering: no
   permutations, just two triangular solves with the explicitly transposed
   factors (built lazily on first use) through the work vector.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *bdev;
  PetscScalar                       *xdev;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* generate the transposed factors (and their analysis) on first use */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* map the vectors onto the device */
  ierr = VecCUDAGetArrayWrite(xx,&xdev);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&bdev);CHKERRQ(ierr);

  /* solve with the transposed upper factor: b -> work */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows, &ALPHA, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        bdev, work->data().get());CHKERRCUDA(stat);

  /* solve with the transposed lower factor: work -> x */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows, &ALPHA, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), xdev);CHKERRCUDA(stat);

  /* release the device arrays and synchronize before logging */
  ierr = VecCUDARestoreArrayRead(bb,&bdev);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xdev);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1074 
/*
   GPU solve xx = A^{-1} bb for a factored matrix with non-trivial
   orderings: gather b through the row permutation, run the L then U
   triangular solves, then apply the column permutation to the result via
   the work vector (the permutation cannot be applied in place).
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *bdev;
  PetscScalar                           *xdev;
  thrust::device_ptr<const PetscScalar> bptr;
  thrust::device_ptr<PetscScalar>       xptr;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                           *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* map the vectors onto the device */
  ierr = VecCUDAGetArrayWrite(xx,&xdev);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&bdev);CHKERRQ(ierr);
  xptr = thrust::device_pointer_cast(xdev);
  bptr = thrust::device_pointer_cast(bdev);

  /* x <- b gathered through the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bptr, fs->rpermIndices->begin()),
               thrust::make_permutation_iterator(bptr, fs->rpermIndices->end()),
               xptr);

  /* solve with the lower factor: x -> work */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows, &ALPHA, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        xdev, work->data().get());CHKERRCUDA(stat);

  /* solve with the upper factor: work -> x */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows, &ALPHA, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), xdev);CHKERRCUDA(stat);

  /* gather x through the column permutation into the work vector, then copy
     the permuted result back into x */
  thrust::copy(thrust::make_permutation_iterator(xptr, fs->cpermIndices->begin()),
               thrust::make_permutation_iterator(xptr, fs->cpermIndices->end()),
               work->begin());
  thrust::copy(work->begin(), work->end(), xptr);

  ierr = VecCUDARestoreArrayRead(bb,&bdev);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xdev);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1133 
/*
   GPU solve xx = A^{-1} bb when the factorization uses natural ordering:
   no permutations, just the L then U triangular solves through the work
   vector.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *bdev;
  PetscScalar                       *xdev;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* map the vectors onto the device */
  ierr = VecCUDAGetArrayWrite(xx,&xdev);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&bdev);CHKERRQ(ierr);

  /* solve with the lower factor: b -> work */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows, &ALPHA, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        bdev, work->data().get());CHKERRCUDA(stat);

  /* solve with the upper factor: work -> x */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows, &ALPHA, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), xdev);CHKERRCUDA(stat);

  ierr = VecCUDARestoreArrayRead(bb,&bdev);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xdev);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1174 
1175 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1176 {
1177 
1178   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1179   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1180   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1181   PetscInt                     m = A->rmap->n,*ii,*ridx;
1182   PetscErrorCode               ierr;
1183   cusparseStatus_t             stat;
1184   cudaError_t                  err;
1185 
1186   PetscFunctionBegin;
1187   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
1188     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1189     if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1190       CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1191       /* copy values only */
1192       matrix->values->assign(a->a, a->a+a->nz);
1193     } else {
1194       MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1195       try {
1196         cusparsestruct->nonzerorow=0;
1197         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1198 
1199         if (a->compressedrow.use) {
1200           m    = a->compressedrow.nrows;
1201           ii   = a->compressedrow.i;
1202           ridx = a->compressedrow.rindex;
1203         } else {
1204           /* Forcing compressed row on the GPU */
1205           int k=0;
1206           ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
1207           ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
1208           ii[0]=0;
1209           for (int j = 0; j<m; j++) {
1210             if ((a->i[j+1]-a->i[j])>0) {
1211               ii[k]  = a->i[j];
1212               ridx[k]= j;
1213               k++;
1214             }
1215           }
1216           ii[cusparsestruct->nonzerorow] = a->nz;
1217           m = cusparsestruct->nonzerorow;
1218         }
1219 
1220         /* allocate space for the triangular factor information */
1221         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1222         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1223         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1224         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
1225 
1226         err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
1227         err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1228         err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err);
1229         err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1230         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
1231 
1232         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1233         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1234           /* set the matrix */
1235           CsrMatrix *matrix= new CsrMatrix;
1236           matrix->num_rows = m;
1237           matrix->num_cols = A->cmap->n;
1238           matrix->num_entries = a->nz;
1239           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1240           matrix->row_offsets->assign(ii, ii + m+1);
1241 
1242           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1243           matrix->column_indices->assign(a->j, a->j+a->nz);
1244 
1245           matrix->values = new THRUSTARRAY(a->nz);
1246           matrix->values->assign(a->a, a->a+a->nz);
1247 
1248           /* assign the pointer */
1249           matstruct->mat = matrix;
1250 
1251         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1252 #if CUDA_VERSION>=4020
1253           CsrMatrix *matrix= new CsrMatrix;
1254           matrix->num_rows = m;
1255           matrix->num_cols = A->cmap->n;
1256           matrix->num_entries = a->nz;
1257           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1258           matrix->row_offsets->assign(ii, ii + m+1);
1259 
1260           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1261           matrix->column_indices->assign(a->j, a->j+a->nz);
1262 
1263           matrix->values = new THRUSTARRAY(a->nz);
1264           matrix->values->assign(a->a, a->a+a->nz);
1265 
1266           cusparseHybMat_t hybMat;
1267           stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1268           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1269             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1270           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1271               matstruct->descr, matrix->values->data().get(),
1272               matrix->row_offsets->data().get(),
1273               matrix->column_indices->data().get(),
1274               hybMat, 0, partition);CHKERRCUDA(stat);
1275           /* assign the pointer */
1276           matstruct->mat = hybMat;
1277 
1278           if (matrix) {
1279             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1280             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1281             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1282             delete (CsrMatrix*)matrix;
1283           }
1284 #endif
1285         }
1286 
1287         /* assign the compressed row indices */
1288         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1289         matstruct->cprowIndices->assign(ridx,ridx+m);
1290 
1291         /* assign the pointer */
1292         cusparsestruct->mat = matstruct;
1293 
1294         if (!a->compressedrow.use) {
1295           ierr = PetscFree(ii);CHKERRQ(ierr);
1296           ierr = PetscFree(ridx);CHKERRQ(ierr);
1297         }
1298         cusparsestruct->workVector = new THRUSTARRAY(m);
1299       } catch(char *ex) {
1300         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1301       }
1302       cusparsestruct->nonzerostate = A->nonzerostate;
1303     }
1304     ierr = WaitForGPU();CHKERRCUDA(ierr);
1305     A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
1306     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1307   }
1308   PetscFunctionReturn(0);
1309 }
1310 
/* Thrust functor used with zip iterators: accumulates the first tuple
   element into the second (second += first). Used to scatter-add the
   compressed-row SpMV result (workVector) into the full output vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1320 
/* Computes yy = A*xx on the GPU using cusparse csr/hyb SpMV.
   The matrix is pushed to the GPU first if the CPU copy is newer. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct; /* Do not initialize yet, because GPU data may not be available yet */
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  /* zero yy first: the SpMV below writes at most mat->num_rows entries of yarray */
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* alpha/beta are device pointers (POINTER_MODE_DEVICE set in CopyToGPU).
       NOTE(review): CopyToGPU stores the matrix with compressed (nonzero) rows, so
       mat->num_rows is the number of nonzero rows; this writes the product compacted
       into the head of yarray instead of scattering via cprowIndices as MatMultAdd
       does — correct only when no rows are empty. Confirm against CopyToGPU. */
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL/HYB storage path (requires CUDA >= 4.2) */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only block when no user-supplied stream is in use */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1362 
/* Computes yy = A^T*xx on the GPU by multiplying with an explicitly stored
   transpose (built lazily on first use) using a NON_TRANSPOSE SpMV, which is
   much faster in cusparse than the transpose operation on the CSR matrix. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* build the explicit transpose on the GPU the first time it is needed */
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  /* zero yy first; the SpMV overwrites only the rows of the transpose */
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* NON_TRANSPOSE on the stored transpose == A^T*x */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL/HYB storage path (requires CUDA >= 4.2) */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only block when no user-supplied stream is in use */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1410 
1411 
/* Computes zz = yy + A*xx on the GPU. The SpMV result for the compressed
   (nonzero) rows lands in workVector and is then scatter-added into zz at the
   original row positions stored in matstruct->cprowIndices. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstruct;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* start from zz = yy, then add A*xx into it */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    /* here we need to be careful to set the number of rows in the multiply to the
       number of compressed rows in the matrix ... which is equivalent to the
       size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      /* ELL/HYB storage path (requires CUDA >= 4.2); skip the SpMV entirely
         when there are no nonzero rows to compute */
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstruct->alpha, matstruct->descr, hybMat,
            xarray, matstruct->beta,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation:
       workVector[k] is added to zz[cprowIndices[k]] for k in [0, workVector->size()) */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUDAPlusEquals());
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1471 
/* Computes zz = yy + A^T*xx on the GPU, multiplying with the lazily built
   explicit transpose; the SpMV result goes into workVector and is then
   scatter-added into zz via matstructT->cprowIndices. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* build the explicit transpose on the GPU the first time it is needed */
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* start from zz = yy, then add A^T*xx into it */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector
         NOTE(review): the SpMV writes mat->num_rows entries (rows of A^T) into
         workVector, which was sized from the compressed row count of A in
         CopyToGPU — verify these always agree, otherwise this can overrun. */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      /* ELL/HYB storage path (requires CUDA >= 4.2); skip the SpMV entirely
         when there are no nonzero rows to compute */
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray, matstructT->beta,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation.
       NOTE(review): the scatter length here is A->cmap->n, whereas
       MatMultAdd_SeqAIJCUSPARSE uses workVector->size(); if workVector is shorter
       than A->cmap->n this reads past its end — confirm against
       MatSeqAIJCUSPARSEGenerateTransposeForMult and CopyToGPU. */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
        VecCUDAPlusEquals());

    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1537 
/* Finish assembly: run the base SeqAIJ assembly, push the (non-factored)
   matrix to the GPU, and on final assembly install the GPU SpMV kernels. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* only final assembly switches the multiply operations to the GPU versions */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1554 
1555 /* --------------------------------------------------------------------------------*/
1556 /*@
1557    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1559    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1560    assembly performance the user should preallocate the matrix storage by setting
1561    the parameter nz (or the array nnz).  By setting these parameters accurately,
1562    performance during matrix assembly can be increased by more than a factor of 50.
1563 
1564    Collective on MPI_Comm
1565 
1566    Input Parameters:
1567 +  comm - MPI communicator, set to PETSC_COMM_SELF
1568 .  m - number of rows
1569 .  n - number of columns
1570 .  nz - number of nonzeros per row (same for all rows)
1571 -  nnz - array containing the number of nonzeros in the various rows
1572          (possibly different for each row) or NULL
1573 
1574    Output Parameter:
1575 .  A - the matrix
1576 
1577    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1578    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1579    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1580 
1581    Notes:
1582    If nnz is given then nz is ignored
1583 
1584    The AIJ format (also called the Yale sparse matrix format or
1585    compressed row storage), is fully compatible with standard Fortran 77
1586    storage.  That is, the stored row and column indices can begin at
1587    either one (as in Fortran) or zero.  See the users' manual for details.
1588 
1589    Specify the preallocated storage with either nz or nnz (not both).
1590    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1591    allocation.  For large problems you MUST preallocate memory or you
1592    will get TERRIBLE performance, see the users' manual chapter on matrices.
1593 
1594    By default, this format uses inodes (identical nodes) when possible, to
1595    improve numerical efficiency of matrix-vector products and solves. We
1596    search for consecutive rows with the same nonzero structure, thereby
1597    reusing matrix information to achieve increased efficiency.
1598 
1599    Level: intermediate
1600 
1601 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1602 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* call the SeqAIJ preallocation implementation directly; the cast only
     drops const to match its (non-const) signature */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1614 
/* Destroy the GPU-side data (multiply structures or triangular factors,
   depending on whether this is a factored matrix), then run the base
   SeqAIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrix: spptr holds the triangular factor structures */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
    /* only tear down GPU data that was actually allocated */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1630 
/* Duplicate a SeqAIJCUSPARSE matrix: duplicate the CPU (SeqAIJ) data, then
   attach a fresh, empty GPU structure to the copy (the GPU data itself is
   rebuilt lazily on the next MatSeqAIJCUSPARSECopyToGPU).

   Fixes over the previous version: removed the dead stores that assigned
   stream (and handle) twice, and replaced the repeated (Mat_SeqAIJCUSPARSE*)
   casts with a typed local pointer. Behavior is unchanged. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;
  Mat C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    C->spptr           = spptr;
    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *spptr = new Mat_SeqAIJCUSPARSETriFactors;
    C->spptr                      = spptr;
    spptr->loTriFactorPtr          = 0;
    spptr->upTriFactorPtr          = 0;
    spptr->loTriFactorPtrTranspose = 0;
    spptr->upTriFactorPtrTranspose = 0;
    spptr->rpermIndices            = 0;
    spptr->cpermIndices            = 0;
    spptr->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle                  = handle;
    spptr->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* GPU copy is (re)built on first CopyToGPU, not copied here */
  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1691 
/* Type constructor for MATSEQAIJCUSPARSE: builds the base SeqAIJ matrix,
   switches the default vector type to CUDA, allocates the (empty) GPU-side
   structure appropriate for a plain or factored matrix, and installs the
   GPU method table.

   Fixes over the previous version: removed the dead stores that assigned
   stream (and handle) twice, and replaced the repeated casts with a typed
   local pointer. Behavior is unchanged. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    B->spptr           = spptr;
    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *spptr = new Mat_SeqAIJCUSPARSETriFactors;
    B->spptr                      = spptr;
    spptr->loTriFactorPtr          = 0;
    spptr->upTriFactorPtr          = 0;
    spptr->loTriFactorPtrTranspose = 0;
    spptr->upTriFactorPtrTranspose = 0;
    spptr->rpermIndices            = 0;
    spptr->cpermIndices            = 0;
    spptr->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle                  = handle;
    spptr->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* no GPU data yet; it is allocated on the first CopyToGPU */
  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1749 
1750 /*MC
1751    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1752 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
1754    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1755    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1756 
1757    Options Database Keys:
1758 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1759 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1760 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1761 
1762   Level: beginner
1763 
1764 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1765 M*/
1766 
1767 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1768 
1769 
/* Register the cusparse solver package for every factorization type it
   supports (LU, Cholesky, ILU, ICC), all backed by the same factory. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              i;

  PetscFunctionBegin;
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1781 
1782 
/* Free the GPU multiply structures (matrix and its cached transpose), the
   work vector, and the cusparse handle, then the container itself.

   Fixes over the previous version: the PetscErrorCode returns of the two
   MatSeqAIJCUSPARSEMultStruct_Destroy calls were silently dropped — they are
   now checked with CHKERRQ, matching the file's error-handling convention;
   the assignment buried inside the if-condition is also lifted out. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    handle = (*cusparsestruct)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1801 
/* Free the three device arrays of a CsrMatrix, then the container, and
   zero the caller's pointer so it cannot be freed twice. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1814 
/* Free one triangular factor structure: its cusparse matrix descriptor,
   its solve-analysis info, and its CSR storage; then zero the caller's
   pointer so it cannot be freed twice. (Overload for the triangular-factor
   variant; see the MatCUSPARSEStorageFormat overload below for the
   multiply structures.) */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1830 
/* Free a multiply structure: the stored matrix (HYB or CSR, depending on
   format), the matrix descriptor, the compressed-row index array, and the
   device-resident alpha/beta scalars; then zero the caller's pointer.

   Fix over the previous version: the PetscErrorCode return of
   CsrMatrix_Destroy was silently dropped — it is now checked with CHKERRQ,
   matching the file's error-handling convention. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        /* ELL and HYB are both stored as a cusparse hybrid matrix */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha and beta were cudaMalloc'ed in CopyToGPU */
    if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1857 
/* Free all four triangular factor structures (lower/upper, plus their
   transposes), the permutation index arrays, the work vector, and the
   cusparse handle; then free the container and zero the caller's pointer.

   Fixes over the previous version: the PetscErrorCode returns of the four
   MatSeqAIJCUSPARSEMultStruct_Destroy calls were silently dropped — they are
   now checked with CHKERRQ, matching the file's error-handling convention;
   the assignment buried inside the if-condition is also lifted out. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1880 
1881