xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision a3fdcf43bac02a6759bb705cf4bb50368f8ea553)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
/* Option strings for MatCUSPARSEStorageFormat; the trailing "MatCUSPARSEStorageFormat","MAT_CUSPARSE_" entries are the enum-name/prefix strings PetscOptionsEnum() expects, and the list is NULL-terminated. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
20 
21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
24 
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 
35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
/*
   MatCUSPARSESetStream - caches the given CUDA stream in the matrix's GPU data
   structure and installs it on the matrix's cuSPARSE handle, so subsequent
   cuSPARSE calls made for this matrix are issued into that stream.

   NOTE(review): A->spptr is dereferenced without a NULL check — the caller is
   assumed to have created the matrix as (SEQ)AIJCUSPARSE so the GPU structure
   exists; confirm at call sites.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
51 
/*
   MatCUSPARSESetHandle - adopts the given cuSPARSE handle for this matrix,
   destroying any different handle the matrix previously held, and switches
   the handle to device pointer mode.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseHandle_t   current;

  PetscFunctionBegin;
  current = cusparsestruct->handle;
  if (current != handle) {
    /* release the handle we currently own before adopting the new one */
    if (current) {
      stat = cusparseDestroy(current);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  /* scalar arguments (alpha/beta) will be read from device memory */
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
67 
/*
   MatCUSPARSEClearHandle - drops this matrix's reference to its cuSPARSE
   handle WITHOUT destroying it; the handle is owned by whoever installed it
   (see MatCUSPARSESetHandle()), so it must not be cusparseDestroy()'d here.

   Fix: guard against a NULL A->spptr — the GPU data structure may not exist
   yet (e.g. the matrix was never assembled as a CUSPARSE type), and the
   original code would dereference the NULL pointer.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
76 
/* Query callback composed on factor matrices: reports that this factorization
   is provided by the MATSOLVERCUSPARSE solver package. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
83 
84 /*MC
85   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
86   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
87   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
88   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
89   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
90   algorithms are not recommended. This class does NOT support direct solver operations.
91 
92   Level: beginner
93 
94 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
95 M*/
96 
/*
   Factory routine registered for MATSEQAIJCUSPARSE + MATSOLVERCUSPARSE: creates
   the (empty) factor matrix B and installs the symbolic-factorization function
   pointers appropriate for the requested factor type. Allocation of the actual
   factor storage is deferred (MAT_SKIP_ALLOCATION) to the symbolic phase.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
121 
/*
   Type-specific implementation behind MatCUSPARSESetFormat(). A sequential
   AIJCUSPARSE matrix has a single storage format, so MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL are handled identically; any other operation is rejected.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both ops set the one stored format */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
139 
140 /*@
141    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
142    operation. Only the MatMult operation can use different GPU storage formats
143    for MPIAIJCUSPARSE matrices.
144    Not Collective
145 
146    Input Parameters:
147 +  A - Matrix of type SEQAIJCUSPARSE
148 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
149 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
150 
151    Output Parameter:
152 
153    Level: intermediate
154 
155 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
156 @*/
/* Public entry point documented in the manual page above: validates the matrix
   and dispatches to the type-specific implementation (if one is composed on the
   object) via PetscTryMethod, so non-CUSPARSE matrices are silently ignored. */
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
166 
/* Processes the runtime options controlling the GPU storage format:
   -mat_cusparse_mult_storage_format (SpMV only; offered only for non-factored
   matrices) and -mat_cusparse_storage_format (SpMV and TriSolve). */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the SpMV-only format option is meaningful only before factorization */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  /* the MAT_CUSPARSE_ALL option is processed last, so if both are given it wins */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
192 
/* Symbolic ILU: reuse the CPU SeqAIJ symbolic phase unchanged, then redirect
   the numeric phase to the CUSPARSE implementation, which copies the computed
   factors to the GPU and installs the GPU triangular solves. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
202 
/* Symbolic LU: reuse the CPU SeqAIJ symbolic phase unchanged, then redirect
   the numeric phase to the CUSPARSE implementation (see
   MatLUFactorNumeric_SeqAIJCUSPARSE below). */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
212 
/* Symbolic ICC: reuse the CPU SeqAIJ symbolic phase unchanged, then redirect
   the numeric phase to the CUSPARSE Cholesky implementation. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
222 
/* Symbolic Cholesky: reuse the CPU SeqAIJ symbolic phase unchanged, then
   redirect the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
232 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - uploads the lower triangular factor
   L to the GPU and runs the cuSPARSE triangular-solve analysis on it.

   The host factor (Mat_SeqAIJ) holds the strictly-lower entries of L in rows
   1..n-1 of a->i/a->j/a->a (row 0 of L has no strictly-lower entries); this
   routine builds a host CSR copy that additionally carries an explicit unit
   diagonal in every row, copies it to device (thrust) arrays, and calls
   cusparse_analysis. Rebuilds happen only when the device copy is stale
   (offload state UNALLOCATED or CPU); empty matrices return immediately.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1 (ai[n]-ai[1]) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory so the later thrust assign()s transfer efficiently */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      /* row 0 consists solely of the unit diagonal entry (0,0)=1 */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      /* NOTE(review): any previously stored loTriFactorPtr is simply overwritten here;
         presumably the offload-state check above prevents repeated builds — confirm */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* the assign()s below copy the pinned host arrays to the device */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* host staging buffers are no longer needed once the device copies exist */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
336 
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - uploads the upper triangular factor
   U to the GPU and runs the cuSPARSE triangular-solve analysis on it.

   The host factor stores U via a->diag: for each row i, aa[adiag[i]] is the
   diagonal slot and the strictly-upper entries of row i occupy
   aa[adiag[i+1]+1 .. adiag[i]-1] (so adiag decreases with i, i.e. U is stored
   back-to-front). The loop below walks rows from n-1 down to 0, rebuilding a
   forward CSR on pinned host memory, then copies it to the device and calls
   cusparse_analysis. Rebuilds happen only when the device copy is stale.

   NOTE(review): the diagonal written to the GPU is 1./v[nz] == 1./aa[adiag[i]];
   presumably the SeqAIJ factor stores the inverted diagonal, so this recovers
   the true value required by the NON_UNIT descriptor — confirm against
   MatLUFactorNumeric_SeqAIJ.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned host staging buffers for the device copies below */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      /* walk rows backwards, filling the CSR arrays from the end toward the front */
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i, already in forward order */
        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* the assign()s below copy the pinned host arrays to the device */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* host staging buffers are no longer needed once the device copies exist */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
436 
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - builds both triangular factors on
   the GPU, allocates the intermediate work vector for the two-stage solve, and
   caches the row/column permutation indices on the device (only when the
   corresponding index set is not the identity).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n,nperm;

  PetscFunctionBegin;
  /* upload L and U and run the cuSPARSE solve analysis on each */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;

  /* lower triangular (row permutation) indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular (column permutation) indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log one n-entry host-to-device transfer per non-identity permutation */
  nperm = 0;
  if (!row_identity) nperm++;
  if (!col_identity) nperm++;
  if (nperm) {
    ierr = PetscLogCpuToGpu(nperm*n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
483 
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - uploads both triangular factors of an
   ICC/Cholesky factorization to the GPU from the single (upper) factor stored
   on the host, and runs the cuSPARSE solve analysis on each.

   Only the upper factor's pattern is built on the host (AiUp/AjUp); it is
   uploaded twice with two value arrays: AAUp for the upper solve, and AALo
   (entries additionally scaled by 1/diag) for the lower solve. The lower
   factor reuses the UPPER fill mode together with OPERATION_TRANSPOSE — i.e.
   the lower solve is performed as a transposed solve of the upper-stored
   pattern, so no transposed copy of the structure is needed.

   NOTE(review): A->data is cast both to Mat_SeqAIJ (for a->nz) and to
   Mat_SeqSBAIJ (for b->i/b->j/b->a); this relies on the factor actually being
   stored in the expected layout — confirm against the Cholesky numeric code.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned host staging buffers; AALo shares the pattern of the upper factor */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the diagonal slot of row i; its reciprocal is stored in both factors */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* negate the off-diagonal entries; the lower-factor copy is additionally
             scaled by the (reciprocal) diagonal */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* intentionally UPPER fill mode: combined with the TRANSPOSE op below,
         the upper-stored pattern acts as the lower factor */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      /* same pattern as the upper factor, different (scaled) values */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      /* host staging buffers are no longer needed once the device copies exist */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
630 
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - copies the ICC/Cholesky factors
   to the GPU (via MatSeqAIJCUSPARSEBuildICCTriMatrices), allocates the work
   vector for the two-stage solve, and caches the permutation on the device.
   The factorization uses a single (symmetric) permutation, so the same index
   array is stored for both rperm and cperm.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* nnz of the full symmetric matrix: off-diagonal entries counted twice
     (assumes a->nz counts the upper triangle including the diagonal — TODO confirm) */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
659 
/*
   Numeric (I)LU factorization: runs the CPU SeqAIJ numeric phase, selects the
   solve implementations (the NaturalOrdering variants are installed only when
   both row and column permutations are the identity), and finally copies the
   triangular factors to the GPU and runs the cuSPARSE solve analysis.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity,natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  natural = (PetscBool)(row_identity && col_identity);
  B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering
                                   : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering
                                   : MatSolveTranspose_SeqAIJCUSPARSE;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
684 
/*
  Numeric Cholesky factorization.  The factorization itself is done by the
  CPU SeqAIJ kernel; afterwards the solve function pointers are selected and
  the triangular factors are copied to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      isNatural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization runs on the CPU */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* choose the solve kernels: natural ordering avoids the permutation copies */
  ierr = ISIdentity(ip,&isNatural);CHKERRQ(ierr);
  if (isNatural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* mirror the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
709 
/*
  Build explicit transposes (CSC copies) of the cached lower and upper
  triangular factors so that MatSolveTranspose can run cuSPARSE triangular
  solves with CUSPARSE_OPERATION_NON_TRANSPOSE on the transposed data.
  Results are cached in cusparseTriFactors->{lo,up}TriFactorPtrTranspose;
  callers invoke this lazily on the first transpose solve.

  Fix vs. previous version: loTriFactorT/upTriFactorT were dead-initialized
  from the (necessarily NULL) cached transpose pointers and then immediately
  overwritten; they are now plain locals, and the cached pointers are set
  through cusparseTriFactors directly instead of re-casting A->spptr.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;   /* built below */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;   /* built below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the matrix descriptors of the lower triangular factor; the fill
     mode flips (lower <-> upper) because we store the transpose */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* the data already is the transpose, so the solve uses NON_TRANSPOSE */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* cache the transpose of the lower factor */
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the matrix descriptors of the upper triangular factor; fill mode flips */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* the data already is the transpose, so the solve uses NON_TRANSPOSE */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* cache the transpose of the upper factor */
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
840 
/*
  Build and cache the transpose of A on the GPU so MatMultTranspose can be
  performed as a NON_TRANSPOSE cusparse multiply, which is faster.

  Fixes vs. previous version:
  - rowoffsets_gpu (the expanded row offsets used only as csr2csc input) was
    never freed -> device-memory leak on every transpose generation;
  - matstructT was dead-initialized from cusparsestruct->matTranspose and
    immediately overwritten;
  - in the ELL/HYB branch the intermediate transpose tempT used the
    un-swapped dimensions of A (rmap x cmap) for its sizes, row-offset
    allocation, and the csr2hyb call; a transpose has A->cmap->n rows and
    A->rmap->n columns (the CSR branch already swapped them correctly).
    This only mattered for rectangular matrices.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;   /* built below, cached at the end */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the transpose multiply information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta (device-resident scalars, see pointer mode below) */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    thrust::device_vector<int> *rowoffsets_gpu;
    const int ncomprow = matstruct->cprowIndices->size();
    thrust::host_vector<int> cprow(ncomprow);
    cprow = *matstruct->cprowIndices; // GPU --> CPU
    /* transposed dimensions: cmap rows, rmap columns */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);
    PetscInt k,i;
    thrust::host_vector<int> rowst_host(A->rmap->n+1);

    /* expand compressed rows, which is forced in constructor (MatSeqAIJCUSPARSECopyToGPU) */
    rowst_host[0] = 0;
    for (k = 0, i = 0; i < A->rmap->n ; i++) {
      if (k < ncomprow && i==cprow[k]) {
        rowst_host[i+1] = a->i[i+1];
        k++;
      } else rowst_host[i+1] = rowst_host[i];  /* empty row: carry the previous offset */
    }
    if (k!=ncomprow) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"%D != %D",k,ncomprow);
    rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    *rowoffsets_gpu = rowst_host; // CPU --> GPU

    /* compute the transpose of the matrix, i.e. its CSC representation */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
    /* the expanded offsets were only needed as csr2csc input; free them (was leaked before) */
    delete rowoffsets_gpu;

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose); note the swapped
       dimensions: the transpose has A->cmap->n rows and A->rmap->n columns */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->cmap->n;
    tempT->num_cols = A->rmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB (dimensions of the transpose) */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* the transpose has no empty rows compressed out: identity compressed-row map */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* cache the transpose */
  cusparsestruct->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
986 
/*
  Solve A^T x = b on the GPU for a factored matrix with non-identity
  row/column permutations (rpermIndices/cpermIndices are assumed set by the
  analysis/copy routines).  The explicitly transposed factors are generated
  lazily on the first call and cached on the factor structure; the solves
  then use CUSPARSE_OPERATION_NON_TRANSPOSE on the transposed data.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (cached afterwards) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x <- b(rperm).
     NOTE(review): this applies rperm on input and cperm on output, the same
     order as the forward solve; confirm the two permutations should not be
     exchanged when solving with A^T. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve with the transposed upper factor (stored explicitly, hence NON_TRANSPOSE): tempGPU <- U^{-T} x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve with the transposed lower factor: x <- L^{-T} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two flops per factor nonzero, minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1055 
/*
  Solve A^T x = b on the GPU when the factorization used the natural
  ordering, so no permutation of b or x is required.  The transposed factors
  are generated lazily on the first call and cached.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* build and cache the transposed factors on first use */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* device views of the right-hand side and the solution */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* solve with the transposed upper factor first: work <- U^{-T} b */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        b, work->data().get());CHKERRCUDA(stat);

  /* then the transposed lower factor: x <- L^{-T} work */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), x);CHKERRCUDA(stat);

  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1106 
/*
  Solve A x = b on the GPU for a factored matrix with non-identity
  row/column permutations: x <- P_c (U^{-1} L^{-1} (P_r b)), where the
  permutations were uploaded by the analysis/copy routines.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x <- b(rperm).
     NOTE(review): the end iterator is built from bGPU rather than bGPU+n as in
     MatSolveTranspose_SeqAIJCUSPARSE; presumably permutation_iterator equality
     only compares the index iterator, making the two forms equivalent — verify
     against the thrust documentation. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L: tempGPU <- L^{-1} x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U: x <- U^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two flops per factor nonzero, minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1167 
/*
  Solve A x = b on the GPU when the factorization used the natural ordering:
  no permutation is needed, just the forward and backward triangular solves.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* device views of the right-hand side and the solution */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* forward solve: work <- L^{-1} b */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        b, work->data().get());CHKERRCUDA(stat);

  /* backward solve: x <- U^{-1} work */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows, &PETSC_CUSPARSE_ONE, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), x);CHKERRCUDA(stat);

  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1210 
/*
   Mirrors the host (SeqAIJ) matrix onto the GPU when the device copy is missing or stale.

   Fast path: if only the numerical values changed (same nonzero pattern, CSR format),
   just re-upload the values.  Otherwise the device structures are rebuilt from scratch
   in the format selected by -mat_cusparse_storage_format (CSR, ELL, or HYB).
   Rows with no nonzeros are always squeezed out on the GPU ("compressed row" storage);
   cprowIndices maps the compressed rows back to their original row numbers.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->pinnedtocpu) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* nonzero pattern unchanged: copy values only */
      matrix->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* NOTE(review): returned PetscErrorCode is ignored here, unlike other call sites in this file — consider CHKERRQ */
      MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        /* count the rows that actually contain nonzeros */
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          /* the host matrix already stores a compressed-row view: reuse it */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU: build temporary row offsets (ii)
             and row-index map (ridx) covering only the nonempty rows */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate the device multiply data structure (descriptor + scalar constants) */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* alpha/beta constants live in device memory because the handle uses POINTER_MODE_DEVICE */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* ELL/HYB: upload a temporary CSR copy, convert it on the device, then discard it */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          /* ELL maps to the MAX partition (pure ellpack); HYB lets cusparse choose the split */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging arrays are no longer needed */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* free the temporaries built in the "forcing compressed row" branch above */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        /* scratch vector sized to the number of (compressed) rows, used by MatMultAdd */
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1348 
/* Binary functor for thrust::for_each over a zip iterator: adds the first tuple
   element into the second, i.e. dest = dest + src.  Used to scatter-accumulate a
   compressed work vector into a full-length vector. */
struct VecCUDAPlusEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};
1358 
/* yy = A*xx, implemented by calling the fused multiply-add kernel with a NULL addend */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1367 
/* yy = A^T * xx.  The transpose is materialized explicitly (lazily, on first use)
   so that the SpMV itself can run in the faster NON_TRANSPOSE mode. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *tmat;
  const PetscScalar            *xptr;
  PetscScalar                  *yptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             cstat;

  PetscFunctionBegin;
  /* host-side operations (axpy, scale, ...) may have invalidated the device copy */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  tmat = (Mat_SeqAIJCUSPARSEMultStruct*)cusp->matTranspose;
  if (!tmat) {
    /* first transpose multiply: build the explicit device transpose */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    tmat = (Mat_SeqAIJCUSPARSEMultStruct*)cusp->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xptr);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yptr);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusp->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *csr = (CsrMatrix*)tmat->mat;
    cstat = cusparse_csr_spmv(cusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                              csr->num_rows, csr->num_cols, csr->num_entries,
                              tmat->alpha, tmat->descr,
                              csr->values->data().get(),
                              csr->row_offsets->data().get(),
                              csr->column_indices->data().get(),
                              xptr, tmat->beta_zero, yptr);CHKERRCUDA(cstat);
  } else {
    /* ELL/HYB storage */
    cstat = cusparse_hyb_spmv(cusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                              tmat->alpha, tmat->descr, (cusparseHybMat_t)tmat->mat,
                              xptr, tmat->beta_zero, yptr);CHKERRCUDA(cstat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xptr);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yptr);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two flops per stored nonzero, minus one per nonempty row (first entry needs no add) */
  ierr = PetscLogGpuFlops(2.0*aij->nz - cusp->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1412 
1413 
/*
   zz = A*xx + yy (or zz = A*xx when yy is NULL; MatMult routes through here).

   When the device matrix stores only nonempty rows (compressed row storage), the
   SpMV result lands in the short workVector and is then scattered/accumulated into
   the full-length zz via cprowIndices; otherwise the SpMV writes zz directly.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* cmpr is TRUE when no rows were squeezed out, i.e. workVector covers all rows */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* SpMV output: directly into zz when uncompressed, otherwise into the short scratch vector */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    /* accumulate in place (beta=1) only when the SpMV already writes the addend's storage */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* compressed path: seed zz with yy; the scatter below adds the SpMV result */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* plain MatMult on compressed storage: rows squeezed out must read as zero */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) {
      ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1501 
/*
   zz = A^T*xx + yy, using the lazily built explicit transpose so the SpMV runs
   in NON_TRANSPOSE mode.  Mirrors the dptr/beta aliasing logic of
   MatMultAdd_SeqAIJCUSPARSE, but compares workVector size against A->cmap->n
   since the result has the column dimension.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray,*dptr,*beta;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* SpMV output: directly into zz only when the work vector spans all columns */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get();
    beta = (yy == zz && dptr == zarray) ? matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (dptr != zarray) {
      ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    } else if (zz != yy) {
      ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);

      /* scatter the data from the temporary into the full vector with a += operation */
      /* NOTE(review): the iteration count here is A->cmap->n, while the analogous
         scatter in MatMultAdd_SeqAIJCUSPARSE uses workVector->size(); in this branch
         workVector->size() != A->cmap->n by construction of dptr — confirm the bound
         is intended (it may over-run cprowIndices/workVector) */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr); /* is this needed? just for yy==0 in Mult */
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1579 
/* Finish host-side assembly, then push the data to the GPU (unfactored matrices only)
   and (re)install the CUSPARSE multiply kernels. Flush assemblies skip all GPU work. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    if (A->factortype == MAT_FACTOR_NONE) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    }
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1596 
1597 /* --------------------------------------------------------------------------------*/
1598 /*@
1599    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1601    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1602    assembly performance the user should preallocate the matrix storage by setting
1603    the parameter nz (or the array nnz).  By setting these parameters accurately,
1604    performance during matrix assembly can be increased by more than a factor of 50.
1605 
1606    Collective
1607 
1608    Input Parameters:
1609 +  comm - MPI communicator, set to PETSC_COMM_SELF
1610 .  m - number of rows
1611 .  n - number of columns
1612 .  nz - number of nonzeros per row (same for all rows)
1613 -  nnz - array containing the number of nonzeros in the various rows
1614          (possibly different for each row) or NULL
1615 
1616    Output Parameter:
1617 .  A - the matrix
1618 
1619    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1621    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1622 
1623    Notes:
1624    If nnz is given then nz is ignored
1625 
1626    The AIJ format (also called the Yale sparse matrix format or
1627    compressed row storage), is fully compatible with standard Fortran 77
1628    storage.  That is, the stored row and column indices can begin at
1629    either one (as in Fortran) or zero.  See the users' manual for details.
1630 
1631    Specify the preallocated storage with either nz or nnz (not both).
1632    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1633    allocation.  For large problems you MUST preallocate memory or you
1634    will get TERRIBLE performance, see the users' manual chapter on matrices.
1635 
1636    By default, this format uses inodes (identical nodes) when possible, to
1637    improve numerical efficiency of matrix-vector products and solves. We
1638    search for consecutive rows with the same nonzero structure, thereby
1639    reusing matrix information to achieve increased efficiency.
1640 
1641    Level: intermediate
1642 
1643 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1644 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global dimensions coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* call the SeqAIJ preallocation routine directly; the cast drops const to match its signature */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1656 
/* Release the GPU-side data (mult structures or triangular factors, depending on
   whether the matrix is factored), then fall through to the SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
    /* nothing to free if the matrix never made it to the GPU */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1672 
/* Duplicate via the SeqAIJ machinery, then attach a fresh (empty) CUSPARSE context
   to the copy; the GPU data itself is rebuilt lazily on the next assembly/mult. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;
    C->spptr           = cusp;
    cusp->mat          = 0;
    cusp->matTranspose = 0;
    cusp->workVector   = 0;
    cusp->format       = MAT_CUSPARSE_CSR;
    cusp->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    cusp->handle       = handle;
    cusp->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *fs = new Mat_SeqAIJCUSPARSETriFactors;
    C->spptr                    = fs;
    fs->loTriFactorPtr          = 0;
    fs->upTriFactorPtr          = 0;
    fs->loTriFactorPtrTranspose = 0;
    fs->upTriFactorPtrTranspose = 0;
    fs->rpermIndices            = 0;
    fs->cpermIndices            = 0;
    fs->workVector              = 0;
    fs->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    fs->handle                  = handle;
    fs->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* no device copy yet; MatSeqAIJCUSPARSECopyToGPU builds it on demand */
  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1731 
/* In-place conversion of a SeqAIJ matrix to the CUSPARSE type: switch the default
   vector type to CUDA, attach an empty device context, and install the CUSPARSE ops. */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;
    B->spptr           = cusp;
    cusp->mat          = 0;
    cusp->matTranspose = 0;
    cusp->workVector   = 0;
    cusp->format       = MAT_CUSPARSE_CSR;
    cusp->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    cusp->handle       = handle;
    cusp->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *fs = new Mat_SeqAIJCUSPARSETriFactors;
    B->spptr                    = fs;
    fs->loTriFactorPtr          = 0;
    fs->upTriFactorPtr          = 0;
    fs->loTriFactorPtrTranspose = 0;
    fs->upTriFactorPtrTranspose = 0;
    fs->rpermIndices            = 0;
    fs->cpermIndices            = 0;
    fs->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    fs->handle                  = handle;
    fs->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* no device copy yet; MatSeqAIJCUSPARSECopyToGPU builds it on demand */
  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1785 
/* Type constructor: build a plain SeqAIJ matrix, then convert it in place to SEQAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1795 
1796 /*MC
1797    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1798 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1800    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1801    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1802 
1803    Options Database Keys:
1804 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1805 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1806 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1807 
1808   Level: beginner
1809 
1810 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1811 M*/
1812 
1813 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1814 
1815 
/* Register the CUSPARSE solver package for every factorization kind it supports
   (LU, Cholesky, ILU, ICC) on SEQAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              i;

  PetscFunctionBegin;
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1827 
1828 
/*
   Frees the device-side context of an unfactored SEQAIJCUSPARSE matrix: the mult
   structures for A and (if built) A^T, the compressed-row work vector, and the
   cusparse handle.  Safe to call with *cusparsestruct == NULL; nulls the pointer
   on exit.

   Fixes: the PetscErrorCode returned by the two MatSeqAIJCUSPARSEMultStruct_Destroy
   calls was previously discarded (inconsistent with the CHKERRQ discipline used
   elsewhere in this file), and the handle test used an assignment inside the if
   condition.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector; /* deleting NULL is a no-op */
    if ((*cusparsestruct)->handle) {
      stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1847 
/* Frees a device CSR matrix (values, column indices, row offsets) and nulls the
   caller's pointer.  A NULL *mat is a no-op; delete of NULL members is harmless. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m;

  PetscFunctionBegin;
  if (!*mat) PetscFunctionReturn(0);
  m    = *mat;
  *mat = 0;
  delete m->values;
  delete m->column_indices;
  delete m->row_offsets;
  delete m;
  PetscFunctionReturn(0);
}
1860 
/* Frees one triangular-factor structure: its cusparse matrix descriptor, the solve
   analysis info, and the underlying CSR data.  Nulls the caller's pointer. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (!*trifactor) PetscFunctionReturn(0);
  if ((*trifactor)->descr)     { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
  if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
  ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
  delete *trifactor;
  *trifactor = 0;
  PetscFunctionReturn(0);
}
1876 
/*
   Destroys a mult struct: frees the matrix storage (HYB handle for ELL/HYB
   format, CsrMatrix otherwise), the matrix descriptor, the compressed-row
   index container, and the three device-resident scalar constants
   (alpha, beta_zero, beta_one), then deletes the struct and nulls the
   caller's pointer.  No-op when *matstruct is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        /* ELL and HYB are both stored as a cusparse hybrid matrix */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* check the return code (previously discarded), matching the trifactor destroy path */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1904 
/*
   Frees a Mat_SeqAIJCUSPARSETriFactors struct: destroys the four triangular
   factors (lower/upper, plain and transpose), the row/column permutation
   index containers, the work vector, and the cusparse handle, then deletes
   the struct and nulls the caller's pointer.  No-op when *trifactors is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* propagate errors from the factor destroy routines (previously the return codes were discarded) */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((handle = (*trifactors)->handle)) { /* extra parens: assignment is intentional */
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1927 
1928