xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 53800007fd6c7b4835acc3912dc4427fee16917e)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 
/* String names for MatCUSPARSEStorageFormat values; the trailing three entries follow the
   PetscEnum convention: enum type name, option prefix, and a null terminator. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
17 
/* Forward declarations: symbolic/numeric factorization kernels */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Forward declarations: triangular solves, options handling, and SpMV/SpMM operations */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Forward declarations: destructors for the GPU-side data structures.
   Note MatSeqAIJCUSPARSEMultStruct_Destroy is a C++ overload (two parameter lists). */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
41 
/*
   MatCUSPARSESetStream - Sets the CUDA stream used for cuSPARSE operations on this matrix.

   Input Parameters:
+  A      - the SEQAIJCUSPARSE matrix
-  stream - the CUDA stream to attach to the matrix's cuSPARSE handle

   The stream is recorded in the Mat_SeqAIJCUSPARSE struct and installed on the
   cuSPARSE handle, so subsequent library calls through that handle are queued
   on this stream.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* guard against a matrix whose GPU data has not been set up (spptr NULL) */
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
52 
/*
   MatCUSPARSESetHandle - Replaces the cuSPARSE handle used by this matrix.

   Input Parameters:
+  A      - the SEQAIJCUSPARSE matrix
-  handle - the cuSPARSE handle to adopt

   Any previously held handle that differs from the new one is destroyed before
   the replacement is stored; the pointer mode is then set to device pointers.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* guard against a matrix whose GPU data has not been set up (spptr NULL) */
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* destroy the handle we currently own before adopting the caller's */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
68 
/*
   MatCUSPARSEClearHandle - Drops the matrix's reference to its cuSPARSE handle
   without destroying it (contrast with MatCUSPARSESetHandle(), which destroys a
   replaced handle).

   Input Parameter:
.  A - the SEQAIJCUSPARSE matrix
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* nothing to clear if the GPU data was never set up */
  if (!cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
77 
/* Reports MATSOLVERCUSPARSE as the solver package for factors created by this file;
   composed on factor matrices as "MatFactorGetSolverType_C". */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
84 
85 /*MC
86   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
87   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm; however, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
92 
93   Level: beginner
94 
95 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
96 M*/
97 
/*
   MatGetFactor_seqaijcusparse_cusparse - Creates an empty SEQAIJCUSPARSE factor
   matrix B for A and installs the symbolic-factorization operations matching the
   requested factor type (LU/ILU/ILUDT or Cholesky/ICC).

   Fix: MatSetBlockSizesFromMats() is now applied for every supported factor
   type; previously it was only done on the LU/ILU branch, so Cholesky/ICC
   factors never inherited A's block sizes.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* inherit A's block sizes regardless of factor type */
  ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* the factor's storage is allocated later by the symbolic stage */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
122 
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested
   GPU storage format in the Mat_SeqAIJCUSPARSE struct. SEQAIJCUSPARSE matrices keep a
   single format, so MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are handled identically. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:            /* fall through: both ops set the single stored format */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* ELL and HYB were introduced with CUDA 4.2; older toolkits only support CSR */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
144 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod is a no-op for Mat types that do not compose "MatCUSPARSESetFormat_C",
     so this routine is safe to call on any Mat */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
171 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Processes -mat_cusparse_mult_storage_format
   and -mat_cusparse_storage_format.

   Fix: A->spptr holds a Mat_SeqAIJCUSPARSE only when A->factortype==MAT_FACTOR_NONE;
   for factored matrices it holds a Mat_SeqAIJCUSPARSETriFactors (see the casts in the
   Build*TriMatrix routines), so the previous code read cusparsestruct->format through
   the wrong struct type for factored matrices. Both option queries are now guarded.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
197 
/* Symbolic ILU: delegate to the host SeqAIJ implementation, then redirect the numeric
   stage to the CUSPARSE version so the factors are pushed to the GPU afterwards. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
207 
/* Symbolic LU: delegate to the host SeqAIJ implementation, then redirect the numeric
   stage to the CUSPARSE version so the factors are pushed to the GPU afterwards. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
217 
/* Symbolic ICC: delegate to the host SeqAIJ implementation, then redirect the numeric
   stage to the CUSPARSE version so the factors are pushed to the GPU afterwards. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
227 
/* Symbolic Cholesky: delegate to the host SeqAIJ implementation, then redirect the
   numeric stage to the CUSPARSE version so the factors are pushed to the GPU afterwards. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
237 
/*
  Extracts the unit lower-triangular factor L from the host ILU factor of A and
  uploads it as a CSR matrix on the GPU, then runs cuSPARSE's triangular-solve
  analysis on it. Only runs when the GPU copy is unallocated or stale
  (PETSC_OFFLOAD_UNALLOCATED / PETSC_OFFLOAD_CPU).

  The host factor's a->i/a->j/a->a hold the strictly-lower part of each row; the
  unit diagonal is implicit and is added explicitly here because the cuSPARSE
  CSR matrix must contain it (descriptor uses CUSPARSE_DIAG_TYPE_UNIT, so the
  stored 1.0 values are not actually read during the solve).

  The staging arrays AiLo/AjLo/AALo are allocated with cudaMallocHost (pinned
  memory) for the host-to-device copies done by the thrust assign() calls.
  NOTE(review): the staging arrays are only freed on the success path; an error
  return via CHKERRQ/CHKERRCUDA in between leaks them.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* = one unit diagonal per row plus the strictly-lower entries of rows 1..n-1
         (row 0 has no strictly-lower entries, hence ai[n]-ai[1]) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix: row 0 is just the unit diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i ... */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        /* ... then append the explicit unit diagonal entry */
        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* the assign() calls copy the pinned host arrays to device memory */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
340 
/*
  Extracts the upper-triangular factor U from the host ILU factor of A and
  uploads it as a CSR matrix on the GPU, then runs cuSPARSE's triangular-solve
  analysis on it. Only runs when the GPU copy is unallocated or stale.

  The host factor stores U rows back-to-front via a->diag: for row i, the
  strictly-upper entries live at aj/aa[adiag[i+1]+1 .. adiag[i]-1] and the entry
  at adiag[i] is the diagonal. The rows are therefore assembled here by walking
  i from n-1 down to 0 while decrementing a running offset.

  NOTE(review): the diagonal written is 1./v[nz] with a NON_UNIT descriptor —
  presumably the host factor stores the inverted diagonal (so this re-inverts to
  the true diagonal); confirm against MatLUFactorNumeric_SeqAIJ's storage.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        /* strictly-upper entries of row i start just past adiag[i+1] */
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries immediately after the diagonal */
        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* the assign() calls copy the pinned host arrays to device memory */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
439 
/* Pushes both ILU triangular factors to the GPU, allocates the device work vector,
   and uploads the row/column permutation indices — but only when the corresponding
   index set is not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n,nperm = 0;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz        = a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;

  /* row permutation, applied before the lower-triangular solve */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    nperm++;
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* column permutation, applied after the upper-triangular solve */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    nperm++;
  }

  /* log n ints per permutation actually uploaded (0, n, or 2n) */
  if (nperm) {
    ierr = PetscLogCpuToGpu(nperm*n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
486 
/*
  Builds both triangular factors of an ICC factorization on the GPU from the
  host factor. The upper factor U is stored explicitly (UNIT diagonal
  descriptor); the lower factor is represented by a second value array over the
  SAME upper-triangular sparsity pattern and solved with
  CUSPARSE_OPERATION_TRANSPOSE, i.e. the lower solve is done as U^T with its
  own scaled values. Only runs when the GPU copy is unallocated or stale.

  NOTE(review): A->data is cast both as Mat_SeqAIJ (for a->nz) and as
  Mat_SeqSBAIJ (for i/j/a) — this relies on the two structs being
  layout-compatible in those leading members for an ICC factor; confirm.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging buffers;
         AALo holds the second value array sharing AiUp/AjUp's pattern) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the host factor's diagonal entry for row i; its reciprocal is stored
           in both value arrays — presumably the host ICC factor keeps the inverted
           diagonal convention; confirm against MatCholeskyFactorNumeric_SeqAIJ */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* off-diagonals: negate for U, and additionally scale by the diagonal for the
             transposed lower solve */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: the "lower" factor keeps UPPER fill because it
         reuses the upper pattern and is solved with the TRANSPOSE operation below */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
632 
/* Pushes the ICC triangular factors to the GPU, allocates the device work vector,
   and — when the ordering is not the identity — uploads the symmetric permutation
   as both the row and column index arrays. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* strictly-upper entries are counted once per factor, diagonal entries once */
  cusparseTriFactors->nnz = (a->nz-n)*2 + n;

  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* symmetric ordering: the same index set serves as both permutations */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
661 
/* Numeric LU: factor on the host via SeqAIJ, pick the solve kernels (the
   permutation-free variants when both orderings are the identity), then copy
   the factors to the GPU and run the cuSPARSE solve analysis. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity,natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  natural = (PetscBool)(row_identity && col_identity);
  B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering          : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
686 
/* Numeric Cholesky factorization for SeqAIJCUSPARSE: factors with the CPU SeqAIJ
   kernel, installs the appropriate GPU solve routines based on the ordering, and
   copies the factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             perm = fact->row;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization itself runs on the CPU */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* select the MatSolve variant: the natural-ordering versions avoid permutation steps */
  ierr = ISIdentity(perm,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* copy the triangular factors to the GPU and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
711 
/* Build, on the GPU, the transposes (CSC form) of the cached lower and upper triangular
   factors and run the cusparse solve analysis on each.  This lets the MatSolveTranspose
   routines run two NON_TRANSPOSE triangular solves on the transposed data instead of
   transposed solves on the original data.  Called lazily from the transpose solve
   routines the first time a transposed solve is requested. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; /* NOTE(review): dead initializer — overwritten by new below */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; /* NOTE(review): dead initializer — overwritten by new below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode is flipped because transposing a lower triangular
     matrix yields an upper triangular one (and vice versa) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: the data itself is transposed, so the solve is untransposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC;
     the CSC arrays of the factor are exactly the CSR arrays of its transpose,
     hence the (values, column_indices, row_offsets) output ordering below */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor
     (fill mode flipped, as for the lower factor above) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: the data itself is transposed, so the solve is untransposed */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
842 
/* Build, on the GPU, the transpose of the assembled matrix so that MatMultTranspose can
   be performed as an untransposed multiply with the transposed data.  CSR storage is
   transposed directly with csr2csc; ELL/HYB storage takes the route
   HYB -> CSR -> CSC -> HYB and requires CUDA 5.0 or later.  The result is cached in
   cusparsestruct->matTranspose. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the matrix-multiply information of the transpose */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalars consumed by the multiply kernels
     (pointer mode DEVICE below requires the scalars to live on the GPU) */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    /* bug fix: the transpose has A->cmap->n rows, so row_offsets needs A->cmap->n+1
       entries; the previous A->rmap->n+1 under-allocated for rectangular matrices
       with more columns than rows (csr2csc writes num_cols+1 column pointers) */
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose, i.e. the CSC, of the matrix; the CSC arrays of A are
       exactly the CSR arrays of A^T, hence the output ordering below */
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->cmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose).
       bug fix: tempT holds A^T, so its dimensions are the transposed ones and its
       row_offsets needs A->cmap->n+1 entries (previously sized with the
       untransposed dimensions, under-allocating for rectangular matrices) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->cmap->n;
    tempT->num_cols = A->rmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB (using the transposed dimensions) */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices: the transpose is stored uncompressed,
     so the row list is simply the identity sequence 0..A->cmap->n-1 */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* cache the transpose on the matrix */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
974 
/* Solve A^T x = b with the cached triangular factors when a non-identity ordering was
   used.  The transposed factors are built lazily on first use; the row permutation is
   applied to b on entry and the column permutation to x on exit via thrust
   permutation iterators. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first transpose solve only) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x[i] = b[rperm[i]].
     NOTE(review): rpermIndices/cpermIndices are only allocated for non-identity
     orderings; this routine is presumably installed only in that case — confirm
     against the factor-numeric routines. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (untransposed solve on the transposed upper factor): x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L (untransposed solve on the transposed lower factor): tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  /* two triangular solves; diagonal counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1043 
/* Solve A^T x = b with the cached triangular factors for the natural (identity)
   ordering: no permutation of b or x is required.  The transposed factors are
   generated lazily the first time a transpose solve is requested. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  cstat;
  PetscErrorCode                    ierr;
  Mat_SeqAIJCUSPARSETriFactors      *fs     = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work   = (THRUSTARRAY*)fs->workVector;

  PetscFunctionBegin;
  /* build the transposed factors on the fly if they do not exist yet */
  if (!lowerT && !upperT) {
    ierr   = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* device views of the right-hand side and the solution */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* sweep with the transposed upper factor: b -> work */
  cstat = cusparse_solve(fs->handle, upperT->solveOp,
                         upperT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upperT->descr,
                         upperT->csrMat->values->data().get(),
                         upperT->csrMat->row_offsets->data().get(),
                         upperT->csrMat->column_indices->data().get(),
                         upperT->solveInfo,
                         b, work->data().get());CHKERRCUDA(cstat);

  /* sweep with the transposed lower factor: work -> x */
  cstat = cusparse_solve(fs->handle, lowerT->solveOp,
                         lowerT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lowerT->descr,
                         lowerT->csrMat->values->data().get(),
                         lowerT->csrMat->row_offsets->data().get(),
                         lowerT->csrMat->column_indices->data().get(),
                         lowerT->solveInfo,
                         work->data().get(), x);CHKERRCUDA(cstat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* hand the vectors back and account for the work of the two solves */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1094 
/* Solve A x = b with the cached triangular factors when a non-identity ordering was
   used: b is permuted by the row permutation on entry, the two triangular solves
   run on the GPU, and the result is permuted by the column permutation on exit. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x[i] = b[rperm[i]].
     NOTE(review): rpermIndices/cpermIndices are only allocated for non-identity
     orderings; this routine is presumably installed only in that case — confirm
     against the factor-numeric routines. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vectors and account for the work of the two solves */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1155 
/* Solve A x = b with the cached triangular factors for the natural (identity)
   ordering: two GPU triangular solves with no permutation of b or x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  cstat;
  PetscErrorCode                    ierr;
  Mat_SeqAIJCUSPARSETriFactors      *fs    = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work  = (THRUSTARRAY*)fs->workVector;

  PetscFunctionBegin;
  /* device views of the right-hand side and the solution */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* sweep with the lower factor: b -> work */
  cstat = cusparse_solve(fs->handle, lower->solveOp,
                         lower->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lower->descr,
                         lower->csrMat->values->data().get(),
                         lower->csrMat->row_offsets->data().get(),
                         lower->csrMat->column_indices->data().get(),
                         lower->solveInfo,
                         b, work->data().get());CHKERRCUDA(cstat);

  /* sweep with the upper factor: work -> x */
  cstat = cusparse_solve(fs->handle, upper->solveOp,
                         upper->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upper->descr,
                         upper->csrMat->values->data().get(),
                         upper->csrMat->row_offsets->data().get(),
                         upper->csrMat->column_indices->data().get(),
                         upper->solveInfo,
                         work->data().get(), x);CHKERRCUDA(cstat);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* hand the vectors back and account for the work of the two solves */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1198 
/*
   Copies (or rebuilds) the cuSPARSE/GPU representation of a SeqAIJ matrix from
   the host-side CSR arrays.  No-op when the matrix is pinned to the CPU or the
   GPU copy is already current.  When only numerical values changed (same
   nonzero state, CSR format) just the values array is transferred; otherwise
   the whole GPU structure is destroyed and rebuilt in the requested storage
   format (CSR, ELL, or HYB).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->pinnedtocpu) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* The nonzero pattern on the GPU is still valid: copy values only */
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* copy values only */
      matrix->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* Nonzero structure changed (or first copy): destroy and rebuild the GPU matrix */
      MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        /* count the rows that have at least one nonzero */
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          /* host already stores a compressed-row view: reuse it */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          /* ii[] = row offsets of the nonempty rows; ridx[] = their original row indices */
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the matrix-multiply structure and its cuSPARSE descriptor */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-resident scalar constants (1, 0, 1) so that the handle can use
           CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
          /* build a temporary device CSR, convert it to HYB/ELL, then free the CSR */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          /* ELL = fixed-width partition, HYB = automatic partition */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR scratch copy is no longer needed once the HYB matrix exists */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
#endif
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* ii/ridx were allocated above only in the non-compressed-row branch */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1339 
/* Thrust functor for use with zip iterators: given a tuple (src, dst),
   accumulates src into dst in place (dst += src). */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1349 
/* yy = A*xx.  Delegates to the multiply-add kernel; passing NULL for the
   addend vector makes MatMultAdd overwrite yy instead of accumulating. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1358 
/* yy = A^T * xx.  Implemented as a NON_TRANSPOSE spmv against an explicitly
   stored transpose of the matrix, which is generated lazily on first use. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build the explicit transpose on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  /* NOTE(review): yy is zeroed before the spmv even though beta_zero is passed
     below -- presumably so rows not covered by the (compressed) transpose stay
     zero; confirm against MatSeqAIJCUSPARSEGenerateTransposeForMult */
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* yy = alpha*A^T*xx + beta_zero*yy using the stored transpose in CSR form */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL/HYB storage path */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* synchronize only when no user-provided stream is attached */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1408 
1409 
/*
   zz = yy + A*xx (MatMultAdd), or zz = A*xx when yy is NULL (used by MatMult).

   When the matrix is stored in compressed-row form the spmv only produces the
   entries of the nonempty rows; those are written to a work vector and then
   scattered/added into zz via the cprowIndices permutation.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* if the work vector spans all rows (no row compression), the spmv can
       write directly into zz; otherwise it writes into the work vector */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n) ? zarray : cusparsestruct->workVector->data().get();
    /* accumulate (beta=1) only when zz already holds yy and is written in place */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      /* skip the spmv entirely when there are no nonempty rows */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
#endif
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* result is still in the work vector: start from zz = yy, scatter-add below */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        /* spmv overwrote zz (beta_zero): add yy explicitly */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* MatMult with compressed rows: zero zz so untouched rows end up 0 */
      ierr = VecSet(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      /* zz[cprowIndices[i]] += workVector[i] for each compressed row i */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1489 
/*
   zz = yy + A^T * xx.  Uses the lazily-generated explicit transpose; the spmv
   writes into the work vector and the result is then scatter-added into zz
   (which was preloaded with yy) through the transpose's cprowIndices map.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build the explicit transpose on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* zz starts as a copy of yy; A^T*xx is accumulated into it below */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      /* skip the spmv entirely when there are no nonempty rows */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray, matstructT->beta_zero,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    /* NOTE(review): the scatter length here is A->cmap->n, whereas MatMultAdd
       uses workVector->size(); confirm these agree for the transpose struct */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
        VecCUDAPlusEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1556 
/* Completes assembly: finish the host-side SeqAIJ assembly, mirror unfactored
   matrices to the GPU, and (on final assembly only) install the GPU mat-vec
   kernels. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1573 
1574 /* --------------------------------------------------------------------------------*/
1575 /*@
1576    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1578    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1579    assembly performance the user should preallocate the matrix storage by setting
1580    the parameter nz (or the array nnz).  By setting these parameters accurately,
1581    performance during matrix assembly can be increased by more than a factor of 50.
1582 
1583    Collective
1584 
1585    Input Parameters:
1586 +  comm - MPI communicator, set to PETSC_COMM_SELF
1587 .  m - number of rows
1588 .  n - number of columns
1589 .  nz - number of nonzeros per row (same for all rows)
1590 -  nnz - array containing the number of nonzeros in the various rows
1591          (possibly different for each row) or NULL
1592 
1593    Output Parameter:
1594 .  A - the matrix
1595 
1596    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1597    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1598    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1599 
1600    Notes:
1601    If nnz is given then nz is ignored
1602 
1603    The AIJ format (also called the Yale sparse matrix format or
1604    compressed row storage), is fully compatible with standard Fortran 77
1605    storage.  That is, the stored row and column indices can begin at
1606    either one (as in Fortran) or zero.  See the users' manual for details.
1607 
1608    Specify the preallocated storage with either nz or nnz (not both).
1609    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1610    allocation.  For large problems you MUST preallocate memory or you
1611    will get TERRIBLE performance, see the users' manual chapter on matrices.
1612 
1613    By default, this format uses inodes (identical nodes) when possible, to
1614    improve numerical efficiency of matrix-vector products and solves. We
1615    search for consecutive rows with the same nonzero structure, thereby
1616    reusing matrix information to achieve increased efficiency.
1617 
1618    Level: intermediate
1619 
1620 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1621 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* create, type, then preallocate -- the order is required by MatSetType */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate with the uniform nz, or per-row counts when nnz is non-NULL */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1633 
/* Destroys the matrix: releases the GPU-side data attached to spptr (either
   the mult structures or the triangular factors, depending on factortype),
   then tears down the host-side SeqAIJ structure. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
    /* nothing to free on the GPU if it was never allocated there */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1649 
/* Duplicates the host AIJ data, then attaches a fresh (empty) CUSPARSE state
   and installs the GPU operations on the duplicate.  The GPU data itself is
   populated lazily (valid_GPU_matrix starts as PETSC_OFFLOAD_UNALLOCATED). */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle = 0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype == MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    C->spptr            = spptr;
    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *spptr = new Mat_SeqAIJCUSPARSETriFactors;
    C->spptr                      = spptr;
    spptr->loTriFactorPtr          = 0;
    spptr->upTriFactorPtr          = 0;
    spptr->loTriFactorPtrTranspose = 0;
    spptr->upTriFactorPtrTranspose = 0;
    spptr->rpermIndices            = 0;
    spptr->cpermIndices            = 0;
    spptr->workVector              = 0;
    spptr->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle                  = handle;
    spptr->nnz                     = 0;
  }

  /* install the CUSPARSE implementations of the matrix operations */
  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1708 
/* Converts a SeqAIJ matrix in place to SEQAIJCUSPARSE: switches the default
   vector type to CUDA, attaches an empty CUSPARSE state, and installs the
   GPU operations.  The GPU data is populated lazily on first copy-to-GPU. */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle = 0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype == MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    B->spptr            = spptr;
    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *spptr = new Mat_SeqAIJCUSPARSETriFactors;
    B->spptr                       = spptr;
    spptr->loTriFactorPtr          = 0;
    spptr->upTriFactorPtr          = 0;
    spptr->loTriFactorPtrTranspose = 0;
    spptr->upTriFactorPtrTranspose = 0;
    spptr->rpermIndices            = 0;
    spptr->cpermIndices            = 0;
    spptr->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle                  = handle;
    spptr->nnz                     = 0;
  }

  /* install the CUSPARSE implementations of the matrix operations */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1762 
/* Type constructor for MATSEQAIJCUSPARSE: builds a SeqAIJ matrix and converts
   it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  /* bug fix: the return code of the conversion was previously unchecked */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1772 
1773 /*MC
1774    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1775 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1777    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1778    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1779 
1780    Options Database Keys:
1781 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1782 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1783 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1784 
1785   Level: beginner
1786 
1787 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1788 M*/
1789 
1790 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1791 
1792 
/* Registers the CUSPARSE solver package for SeqAIJCUSPARSE matrices.  All four
   factorization types are served by the same factory routine. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              i;

  PetscFunctionBegin;
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1804 
1805 
/* Frees the GPU-side mult structures (matrix and transpose), the work vector,
   and the cuSPARSE handle, then deletes the container and NULLs the pointer.
   Safe to call with *cusparsestruct == NULL.

   Fixes: the return codes of the two MultStruct_Destroy calls were previously
   ignored, and the assignment-in-condition lacked clarifying parentheses. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    if ((handle = (*cusparsestruct)->handle)) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1824 
/* Frees a CsrMatrix (values, column indices, row offsets, and the container
   itself) and NULLs the caller's pointer.  A NULL *mat is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;
    /* delete on a NULL member is a harmless no-op, so no guards are needed */
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1837 
/* Frees one triangular-factor structure: its cuSPARSE matrix descriptor,
   solve-analysis info, and CSR data.  NULLs the caller's pointer. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
    if (tf->descr)     { stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUDA(stat); }
    if (tf->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(tf->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    delete tf;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1853 
/* Frees a mult structure in the given storage format: the HYB or CSR matrix,
   the cuSPARSE descriptor, the compressed-row index array, and the
   device-resident scalar constants.  NULLs the caller's pointer.

   Fixes: the return code of CsrMatrix_Destroy was previously ignored here,
   unlike in the triangular-factor overload which checks it. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* the alpha/beta constants live in device memory (see CopyToGPU) */
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1881 
/* Frees the complete triangular-factor container: the four factor structures,
   the permutation index arrays, the work vector, and the cuSPARSE handle.
   NULLs the caller's pointer.

   Fixes: the return codes of the four MultStruct_Destroy calls were
   previously ignored, and the assignment-in-condition lacked clarifying
   parentheses. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((handle = (*trifactors)->handle)) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1904 
1905