xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision d60bce21cdb268dff810a60347647b642287f33e)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/async/for_each.h>
16 
/* Human-readable option strings for the storage-format / algorithm enums below.
   The trailing entries of each array (enum type name, option-string prefix,
   NULL sentinel) follow the array convention expected by PetscOptionsEnum(),
   which parses these names positionally as 0-based enum values. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
68 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
69 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
74 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
75 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
76 
77 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
82 
83 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
84 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
85 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
86 
87 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
88 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
89 
90 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
91 
/* Attach a user-provided CUDA stream to the matrix' cuSPARSE handle so that
   subsequent cuSPARSE calls for A are issued on that stream. Errors if the
   matrix has no GPU data structure (spptr) yet. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
103 
/* Replace the cuSPARSE library handle stored on the matrix. A previously
   stored (different) handle is destroyed first; the pointer mode of the new
   handle is set to device pointers. Errors if the matrix has no spptr. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    if (cusp->handle) {cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);}
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
120 
/* Forget (without destroying) the cuSPARSE handle stored on the matrix.
   Silently does nothing for matrices that are not MATSEQAIJCUSPARSE or
   that have no GPU data structure. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          isseqaijcusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isseqaijcusparse);CHKERRQ(ierr);
  if (!isseqaijcusparse || !cusp) PetscFunctionReturn(0);
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
133 
/* Composed "MatFactorGetSolverType_C" callback: reports that factor matrices
   created by this file belong to the MATSOLVERCUSPARSE solver package. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
140 
141 /*MC
142   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
143   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
144 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
145   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
146   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
147   algorithms are not recommended. This class does NOT support direct solver operations.
148 
149   Level: beginner
150 
151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
152 M*/
153 
/* Creates the (square, sequential) factor matrix B for A and wires up the
   symbolic-factorization entry points appropriate for the requested factor
   type; also records the preferred orderings and composes the solver-type
   query function. Unsupported factor types raise PETSC_ERR_SUP. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt       n = A->rmap->n;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
184 
/* Implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE matrices.
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both store into the same format
   slot, since a sequential matrix has a single GPU representation.

   Robustness/consistency fix: guard against a missing spptr with the same
   SETERRQ used by MatCUSPARSESetStream()/MatCUSPARSESetHandle(), instead of
   dereferencing a NULL pointer. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
202 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* runs the type-specific implementation if one is composed on A; no-op otherwise */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
229 
/* MatSetOption() implementation: intercepts the GPU-relevant option and
   forwards everything else to the host SeqAIJ implementation. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* when turning the option off, destroy any stored transpose so stale
       data cannot be used if the option is re-enabled later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
247 
248 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
249 
/* Numeric LU factorization: the factorization itself runs on the host
   (after pulling the latest values off the GPU), then the triangular
   factors are copied to the GPU and analyzed for the cuSPARSE solves.
   Natural row/column ordering selects the faster MatSolve variants. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* dense-RHS solves are not provided by this class */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
280 
/* Processes the -mat_cusparse_* options-database entries for a SEQAIJCUSPARSE
   matrix: GPU storage format for SpMV (and for SpMV+TriSolve together) and,
   when built against CUDA 11+, the cuSPARSE algorithm choices for SpMV, SpMM,
   and CSR-to-CSC conversion. Factored matrices expose no options here. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    /* CUDA 11.4+ uses a different name (CUSPARSE_SPMV_CSR_ALG1) for the enum expected at position 2 -- see the enum listing at the top of this file */
    if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
319 
/* Symbolic ILU: discard any stale GPU triangular-factor data, run the host
   SeqAIJ symbolic phase, and install the CUSPARSE numeric-factorization stage. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
331 
/* Symbolic LU: discard any stale GPU triangular-factor data, run the host
   SeqAIJ symbolic phase, and install the CUSPARSE numeric-factorization stage. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
343 
/* Symbolic ICC: discard any stale GPU triangular-factor data, run the host
   SeqAIJ symbolic phase, and install the CUSPARSE numeric-factorization stage. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
355 
/* Symbolic Cholesky: discard any stale GPU triangular-factor data, run the
   host SeqAIJ symbolic phase, and install the CUSPARSE numeric stage. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
367 
/* MatSeqAIJCUSPARSEBuildILULowerTriMatrix - builds (first call) or refreshes
   (later calls) the GPU copy of the unit-diagonal lower-triangular factor L
   used by the cuSPARSE triangular solves.

   First call: the CSR structure of L -- the strictly-lower part of each row of
   the host factor plus an explicit 1.0 on the diagonal -- is assembled in
   pinned host buffers (cudaMallocHost), uploaded to the device, and handed to
   the cuSPARSE solve analysis.  Later calls keep the stored structure and only
   re-upload the numerical values.

   NOTE(review): assumes a->i/a->j/a->a hold the factored SeqAIJ layout with
   the strictly lower-triangular entries stored row by row -- confirm against
   MatLUFactorNumeric_SeqAIJ, which produces the factor this consumes. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n diagonal entries plus the strictly-lower entries of rows 1..n-1 (row 0 of L holds only its diagonal) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host memory so the thrust assign()s below copy to the device efficiently */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          /* append the unit diagonal entry after the strictly-lower entries of row i */
          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* the assign()s below perform the host-to-device copies */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE needs an explicit work buffer for the triangular-solve analysis */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                               #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                               #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep AALo alive (as AA_h) for the value-only update path below; the structure arrays are no longer needed */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
514 
515 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
516 {
517   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
518   PetscInt                          n = A->rmap->n;
519   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
520   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
521   cusparseStatus_t                  stat;
522   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
523   const MatScalar                   *aa = a->a,*v;
524   PetscInt                          *AiUp, *AjUp;
525   PetscInt                          i,nz, nzUpper, offset;
526   PetscErrorCode                    ierr;
527   cudaError_t                       cerr;
528 
529   PetscFunctionBegin;
530   if (!n) PetscFunctionReturn(0);
531   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
532     try {
533       /* next, figure out the number of nonzeros in the upper triangular matrix. */
534       nzUpper = adiag[0]-adiag[n];
535       if (!upTriFactor) {
536         PetscScalar *AAUp;
537 
538         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
539 
540         /* Allocate Space for the upper triangular matrix */
541         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
542         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
543 
544         /* Fill the upper triangular matrix */
545         AiUp[0]=(PetscInt) 0;
546         AiUp[n]=nzUpper;
547         offset = nzUpper;
548         for (i=n-1; i>=0; i--) {
549           v  = aa + adiag[i+1] + 1;
550           vi = aj + adiag[i+1] + 1;
551 
552           /* number of elements NOT on the diagonal */
553           nz = adiag[i] - adiag[i+1]-1;
554 
555           /* decrement the offset */
556           offset -= (nz+1);
557 
558           /* first, set the diagonal elements */
559           AjUp[offset] = (PetscInt) i;
560           AAUp[offset] = (MatScalar)1./v[nz];
561           AiUp[i]      = AiUp[i+1] - (nz+1);
562 
563           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
564           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
565         }
566 
567         /* allocate space for the triangular factor information */
568         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
569         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
570 
571         /* Create the matrix description */
572         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
573         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
574        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
575         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
576        #else
577         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
578        #endif
579         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
580         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
581 
582         /* set the operation */
583         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
584 
585         /* set the matrix */
586         upTriFactor->csrMat = new CsrMatrix;
587         upTriFactor->csrMat->num_rows = n;
588         upTriFactor->csrMat->num_cols = n;
589         upTriFactor->csrMat->num_entries = nzUpper;
590 
591         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
592         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
593 
594         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
595         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
596 
597         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
598         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
599 
600         /* Create the solve analysis information */
601         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
602         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
603       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
604         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
605                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
606                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
607                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
608                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
609         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
610       #endif
611 
612         /* perform the solve analysis */
613         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
614                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
615                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
616                                  upTriFactor->csrMat->column_indices->data().get(),
617                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
618                                  upTriFactor->solveInfo,
619                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
620                                #else
621                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
622                                #endif
623         cerr = WaitForCUDA();CHKERRCUDA(cerr);
624         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
625 
626         /* assign the pointer */
627         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
628         upTriFactor->AA_h = AAUp;
629         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
630         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
631         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
632       } else {
633         if (!upTriFactor->AA_h) {
634           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
635         }
636         /* Fill the upper triangular matrix */
637         offset = nzUpper;
638         for (i=n-1; i>=0; i--) {
639           v  = aa + adiag[i+1] + 1;
640 
641           /* number of elements NOT on the diagonal */
642           nz = adiag[i] - adiag[i+1]-1;
643 
644           /* decrement the offset */
645           offset -= (nz+1);
646 
647           /* first, set the diagonal elements */
648           upTriFactor->AA_h[offset] = 1./v[nz];
649           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
650         }
651         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
652         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
653       }
654     } catch(char *ex) {
655       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
656     }
657   }
658   PetscFunctionReturn(0);
659 }
660 
/*
   Pushes the ILU triangular factors of A onto the GPU and caches the
   row/column permutation indices (when the orderings are not the identity)
   so that MatSolve can permute the right-hand side/solution on the device.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowis = aij->row,colis = aij->icol;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscInt                     m = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector reused by the two triangular solves */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(m);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: upload once, only when the ordering is not the identity */
  ierr = ISIdentity(rowis,&rowIsIdentity);CHKERRQ(ierr);
  if (!rowIsIdentity && !factors->rpermIndices) {
    const PetscInt *ridx;

    ierr = ISGetIndices(rowis,&ridx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(ridx,ridx+m);
    ierr = ISRestoreIndices(rowis,&ridx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(m*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation: same treatment as the rows */
  ierr = ISIdentity(colis,&colIsIdentity);CHKERRQ(ierr);
  if (!colIsIdentity && !factors->cpermIndices) {
    const PetscInt *cidx;

    ierr = ISGetIndices(colis,&cidx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(cidx,cidx+m);
    ierr = ISRestoreIndices(colis,&cidx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(m*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
704 
/*
   Builds (first call) or refreshes (subsequent calls) the GPU triangular
   factors used for ICC/Cholesky solves. Only the upper triangle U is stored
   in CSR form; the "lower" solve reuses the same CSR pattern and is run as a
   transpose solve (see loTriFactor->solveOp below). The diagonal is stored
   inverted so the solves never divide.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;       /* pinned-host CSR row offsets / column indices of U */
  PetscScalar                       *AAUp;              /* pinned-host values of U */
  PetscScalar                       *AALo;              /* pinned-host values of the (transposed) lower solve */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is reinterpreted as Mat_SeqSBAIJ here — presumably the
     ICC factor stores its upper triangle with an sbaij-compatible i/j/a layout;
     confirm against the factorization routine that fills A->data */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the freshest data lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers; the lower values fit in nzUpper entries
         because the lower solve shares U's CSR pattern (transpose solve) */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      /* first build: create structure + values; otherwise refresh values only */
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];        /* inverted diagonal of U */
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* off-diagonals: negate for U, and additionally scale by the
               inverted diagonal for the lower-solve values */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        /* diagonal is applied explicitly (stored inverted), so mark it UNIT */
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2-style API needs an explicit work buffer sized by the library */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* fill mode stays UPPER: the lower factor is realized as U^T via the
           TRANSPOSE solve operation below, reusing U's CSR pattern */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        /* row offsets and column indices are shared with U (copied, not aliased) */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        /* 2x: both factors upload structure and values */
        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* structure already on the GPU: recompute values only (same sparsity) */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        /* NOTE(review): these messages mirror the factors-level check but refer
           to the individual factor structs; consider more specific text */
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
924 
/*
   Pushes the ICC/Cholesky triangular factors of A onto the GPU and caches the
   permutation (and its inverse) used to reorder vectors on the device during
   MatSolve, when the ordering is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch vector reused by the two triangular solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* only the upper triangle is stored: the full factor counts the strict
     triangle twice (lower and upper) plus the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: upload the permutation and its inverse once.
     Guard on rpermIndices (as the ILU path does) so repeated numeric
     factorizations do not leak the previously allocated device arrays. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity && !cusparseTriFactors->rpermIndices) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
962 
/*
   Numeric Cholesky factorization for SEQAIJCUSPARSE: factor on the CPU with
   the SeqAIJ kernel, then select the GPU solve variants and stage the
   triangular factors on the device.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *baij    = (Mat_SeqAIJ*)B->data;
  IS             rowperm  = baij->row;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: the
     natural-ordering variants skip the device-side permutation */
  ierr = ISIdentity(rowperm,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no cuSPARSE MatMatSolve implementations */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
992 
993 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
994 {
995   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
996   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
997   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
998   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
999   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1000   cusparseStatus_t                  stat;
1001   cusparseIndexBase_t               indexBase;
1002   cusparseMatrixType_t              matrixType;
1003   cusparseFillMode_t                fillMode;
1004   cusparseDiagType_t                diagType;
1005   cudaError_t                       cerr;
1006   PetscErrorCode                    ierr;
1007 
1008   PetscFunctionBegin;
1009   /* allocate space for the transpose of the lower triangular factor */
1010   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1011   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1012 
1013   /* set the matrix descriptors of the lower triangular factor */
1014   matrixType = cusparseGetMatType(loTriFactor->descr);
1015   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1016   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1017     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1018   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1019 
1020   /* Create the matrix description */
1021   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1022   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1023   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1024   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1025   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1026 
1027   /* set the operation */
1028   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1029 
1030   /* allocate GPU space for the CSC of the lower triangular factor*/
1031   loTriFactorT->csrMat = new CsrMatrix;
1032   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1033   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1034   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1035   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1036   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1037   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1038 
1039   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1040 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1041   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1042                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1043                                        loTriFactor->csrMat->values->data().get(),
1044                                        loTriFactor->csrMat->row_offsets->data().get(),
1045                                        loTriFactor->csrMat->column_indices->data().get(),
1046                                        loTriFactorT->csrMat->values->data().get(),
1047                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1048                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1049                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1050   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1051 #endif
1052 
1053   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1054   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1055                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1056                           loTriFactor->csrMat->values->data().get(),
1057                           loTriFactor->csrMat->row_offsets->data().get(),
1058                           loTriFactor->csrMat->column_indices->data().get(),
1059                           loTriFactorT->csrMat->values->data().get(),
1060                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1061                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1062                           CUSPARSE_ACTION_NUMERIC, indexBase,
1063                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1064                         #else
1065                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1066                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1067                         #endif
1068   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1069   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1070 
1071   /* Create the solve analysis information */
1072   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1073   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1074 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1075   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1076                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1077                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1078                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1079                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1080   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1081 #endif
1082 
1083   /* perform the solve analysis */
1084   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1085                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1086                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1087                            loTriFactorT->csrMat->column_indices->data().get(),
1088                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1089                            loTriFactorT->solveInfo,
1090                            loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1091                           #else
1092                            loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1093                           #endif
1094   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1095   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1096 
1097   /* assign the pointer */
1098   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1099 
1100   /*********************************************/
1101   /* Now the Transpose of the Upper Tri Factor */
1102   /*********************************************/
1103 
1104   /* allocate space for the transpose of the upper triangular factor */
1105   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1106   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1107 
1108   /* set the matrix descriptors of the upper triangular factor */
1109   matrixType = cusparseGetMatType(upTriFactor->descr);
1110   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1111   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1112     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1113   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1114 
1115   /* Create the matrix description */
1116   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1117   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1118   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1119   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1120   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1121 
1122   /* set the operation */
1123   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1124 
1125   /* allocate GPU space for the CSC of the upper triangular factor*/
1126   upTriFactorT->csrMat = new CsrMatrix;
1127   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1128   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1129   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1130   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1131   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1132   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1133 
1134   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1135 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1136   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1137                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1138                                 upTriFactor->csrMat->values->data().get(),
1139                                 upTriFactor->csrMat->row_offsets->data().get(),
1140                                 upTriFactor->csrMat->column_indices->data().get(),
1141                                 upTriFactorT->csrMat->values->data().get(),
1142                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1143                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1144                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1145   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1146 #endif
1147 
1148   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1149   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1150                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1151                           upTriFactor->csrMat->values->data().get(),
1152                           upTriFactor->csrMat->row_offsets->data().get(),
1153                           upTriFactor->csrMat->column_indices->data().get(),
1154                           upTriFactorT->csrMat->values->data().get(),
1155                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1156                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1157                           CUSPARSE_ACTION_NUMERIC, indexBase,
1158                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1159                         #else
1160                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1161                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1162                         #endif
1163 
1164   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1165   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1166 
1167   /* Create the solve analysis information */
1168   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1169   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1170   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1171   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1172                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1173                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1174                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1175                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1176   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1177   #endif
1178 
1179   /* perform the solve analysis */
1180   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1181                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1182                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1183                            upTriFactorT->csrMat->column_indices->data().get(),
1184                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1185                            upTriFactorT->solveInfo,
1186                            upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1187                           #else
1188                            upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1189                           #endif
1190 
1191   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1192   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1193 
1194   /* assign the pointer */
1195   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1196   PetscFunctionReturn(0);
1197 }
1198 
/* Unary functor mapping a PetscScalar to a PetscInt by truncating its real part.
   Used with thrust::transform to recover integer indices that were transported
   through a scalar-valued cusparse conversion (see the csr2csc_i construction in
   MatSeqAIJCUSPARSEFormExplicitTranspose, which transposes a 0,1,2,... sequence). */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar x)
  {
    return static_cast<PetscInt>(PetscRealPart(x));
  }
};
1207 
/*
   MatSeqAIJCUSPARSEFormExplicitTranspose - builds (or refreshes) an explicit copy of the
   transpose of A on the GPU, stored in ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose.

   For the MAT_CUSPARSE_CSR format the transpose is produced with cusparse csr2csc.  On the
   first call a value->index permutation (cusparsestruct->csr2csc_i) is also computed, by
   transposing a 0,1,2,... scalar sequence; later calls then refresh the transposed values
   with a single thrust gather instead of re-running csr2csc.  For MAT_CUSPARSE_ELL/HYB
   (unsupported since CUDA 11) the matrix is converted HYB->CSR, transposed with csr2csc,
   and converted back to HYB.

   Returns immediately when A->transupdated is set, i.e. the cached transpose is current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* make sure the host CSR data is mirrored on the GPU before transposing it */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR formats have no cached update path: rebuild the transpose from scratch */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    /* the transpose inherits the index base (0- or 1-based) of the original matrix */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the CSR storage of A^T: row/column dimensions swap, nnz is unchanged */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* keep a device copy of A's row offsets (a->i) for the csr2csc calls below */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* build the csr->csc value permutation once: transpose the scalar sequence 0,1,2,...
         and read the permuted positions back as integers (PetscScalarToPetscInt) */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* refresh the transposed values by gathering A's values through the cached permutation */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1437 
1438 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   MatSolveTranspose_SeqAIJCUSPARSE - solves A^T x = b using the triangular factors of A,
   for a factorization performed with a non-natural ordering (rperm/cperm present).

   The transposed factor structures (loTriFactorPtrTranspose/upTriFactorPtrTranspose) are
   generated lazily on first use by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve.  Since
   A = LU implies A^T = U^T L^T, the upper factor is applied first, then the lower one.
   b is first gathered through the row permutation, and the result is gathered through the
   column permutation at the end (via the work vector, since that gather cannot be in place).
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x[i] = b[rperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (upper factor first: A^T = U^T L^T); result goes into tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L; result goes back into xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz - n flops (forward + backward substitution) */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1526 
/*
   MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - solves A^T x = b using the triangular
   factors of A when the factorization used the natural ordering, so no row/column
   permutations of b or x are required.  As in MatSolveTranspose_SeqAIJCUSPARSE, the
   transposed factor structures are generated lazily, and since A = LU implies
   A^T = U^T L^T the upper factor is applied first, then the lower one.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: barray -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L: tempGPU -> xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz - n flops (forward + backward substitution) */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1596 
/*
   MatSolve_SeqAIJCUSPARSE - solves A x = b using the cached LU/ILU triangular factors,
   for a factorization performed with a non-natural ordering (rperm/cperm present).

   The right-hand side is gathered through the row permutation into the work vector,
   a forward solve with L and a backward solve with U are performed with cusparse,
   and the result is gathered through the column permutation into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: tempGPU[i] = b[rperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L (forward substitution): tempGPU -> xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve U (backward substitution): xarray -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation: x[i] = tempGPU[cperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz - n flops (forward + backward substitution) */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1672 
/*
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering - solves A x = b using the cached LU/ILU
   triangular factors when the factorization used the natural ordering, so no row/column
   permutations of b or x are needed: forward solve with L into the work vector, then
   backward solve with U into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L (forward substitution): barray -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Next, solve U (backward substitution): tempGPU -> xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz - n flops (forward + backward substitution) */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1734 
/*
   Downloads the matrix values from the device CSR storage into the host
   Mat_SeqAIJ arrays when the device holds the only up-to-date copy.
   Afterwards both copies are valid (PETSC_OFFLOAD_BOTH).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix    *csr   = (CsrMatrix*)cusp->mat->mat;
    const size_t nbytes = aij->nz*sizeof(PetscScalar);

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    /* only the numerical values move; the host already owns the pattern */
    cerr = cudaMemcpy(aij->a,csr->values->data().get(),nbytes,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(nbytes);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1755 
/*
   Returns the host value array of the matrix, first syncing it from the GPU
   if needed.  Since the caller may modify the values through the returned
   pointer, the device copy is marked stale (PETSC_OFFLOAD_CPU).
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU; /* write access: only the host copy stays valid */
  *array = aij->a;
  PetscFunctionReturn(0);
}
1767 
/*
   Uploads the matrix from the host (Mat_SeqAIJ arrays) to the device
   (Mat_SeqAIJCUSPARSE).  Fast path: if the nonzero pattern is unchanged and
   the format is CSR, only the values are re-assigned to the existing device
   arrays.  Otherwise the whole device representation (CSR, or ELL/HYB before
   CUDA 11) is destroyed and rebuilt from the host CSR data, honoring
   compressed-row storage when in use.  On success A->offloadmask becomes
   PETSC_OFFLOAD_BOTH, unless the host had no values array (then only the
   device pattern exists and the mask is left unchanged).
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* set to FALSE when the host has no values: only the GPU copy will be valid */
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed, so any cached transpose values no longer match */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* pattern changed (or non-CSR format): tear down and rebuild everything */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* compressed-row storage: only rows with nonzeros are kept;
             ridx maps compressed row index -> actual row index */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } /* pattern-only matrix: take nnz from the row offsets */
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants (1 and 0), used with
           CUSPARSE_POINTER_MODE_DEVICE set just below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* build a temporary CSR on the device, convert it to HYB/ELL, then discard it */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m; /* tmp counts the PetscInt entries transferred, for logging below */
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1924 
/* thrust functor over a (src, dst) tuple: accumulates src into dst, i.e. dst += src */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1934 
/* thrust functor over a (src, dst) tuple: copies src into dst, i.e. dst = src */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    using thrust::get;
    get<1>(t) = get<0>(t);
  }
};
1944 
/* thrust functor over a (dst, src) tuple: copies in the opposite direction
   of VecCUDAEquals, i.e. element 0 = element 1 */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    using thrust::get;
    get<0>(t) = get<1>(t);
  }
};
1954 
/* Scratch data attached to C->product for mat-mat products involving a
   SeqAIJCUSPARSE matrix (sparse*dense via SpMM, sparse*sparse via SpGEMM);
   freed by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool             cisdense; /* was C of type MATSEQDENSE (CPU)? if so, convert the result back after the GPU product */
  PetscScalar           *Bt;      /* device buffer for B^T, needed pre CUDA-11 since cusparseXcsrmm cannot transpose B */
  Mat                   X;        /* intermediate dense product for PtAP/RARt */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;    /* replacement CSR for B when B is stored in compressed-row form */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* sparse descriptor matching Bcsr */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr; /* dense descriptor for B (SpMM) */
  cusparseDnMatDescr_t  matCDescr; /* dense descriptor for C (or X) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4; /* extra SpGEMMreuse work buffers */
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize; /* size of mmBuffer, to decide when to reallocate */
  void                  *mmBuffer;    /* SpMM/SpGEMM compute buffer */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
1979 
/* Destructor for the MatMatCusparse scratch data hung off C->product:
   releases device buffers, cusparse descriptors and the intermediate mat */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a valid no-op */
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mm->matSpBDescr) { stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mm->matBDescr)   { stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat); }
  if (mm->matCDescr)   { stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat); }
  if (mm->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat); }
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mm->dBuffer4)    { cerr = cudaFree(mm->dBuffer4);CHKERRCUDA(cerr); }
  if (mm->dBuffer5)    { cerr = cudaFree(mm->dBuffer5);CHKERRCUDA(cerr); }
 #endif
  if (mm->mmBuffer)    { cerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cerr); }
  if (mm->mmBuffer2)   { cerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cerr); }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2008 
2009 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2010 
/*
   Numeric phase of C = op(A)*op(B) with A sparse (SeqAIJCUSPARSE) and B
   dense (SeqDENSECUDA).  Supports AB, AtB, ABt directly; PtAP and RARt are
   done as X = A*op(B) here followed by a dense-dense product into C.
   CUDA >= 11 uses the generic cusparseSpMM API with cached descriptors and
   work buffer; older CUDA uses csrmm, which cannot transpose B, so B^T is
   formed explicitly with cublasXgeam when needed.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda; /* op(A) is m x k; result is m x n; lda's of B and C */
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select op(A) and the result dimensions for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* use either implicit transpose in the SpMM call or an explicitly formed A^T */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* for PtAP/RARt the SpMM result goes into the intermediate matrix X, not C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the work buffer only if the current one is too small (or absent) */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* materialize B^T into mmdata->Bt (allocated in the symbolic phase) */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt with the dense-dense product C = op(B)*X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* undo the temporary conversions to CUDA types done above / requested by the user */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2183 
/*
   Symbolic phase of C = op(A)*op(B) with A sparse (SeqAIJCUSPARSE) and B
   dense: sets the sizes and type of C, allocates the MatMatCusparse product
   data (including the B^T buffer on pre-CUDA-11, and the intermediate dense
   matrix X for PtAP/RARt), and installs the numeric-phase callback.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n; /* dimensions of the product C */
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2257 
2258 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2259 {
2260   Mat_Product                  *product = C->product;
2261   Mat                          A,B;
2262   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2263   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2264   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2265   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2266   PetscBool                    flg;
2267   PetscErrorCode               ierr;
2268   cusparseStatus_t             stat;
2269   cudaError_t                  cerr;
2270   MatProductType               ptype;
2271   MatMatCusparse               *mmdata;
2272 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2273   cusparseSpMatDescr_t         BmatSpDescr;
2274 #endif
2275   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2276 
2277   PetscFunctionBegin;
2278   MatCheckProduct(C,1);
2279   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2280   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2281   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2282   mmdata = (MatMatCusparse*)C->product->data;
2283   A = product->A;
2284   B = product->B;
2285   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2286     mmdata->reusesym = PETSC_FALSE;
2287     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2288     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2289     Cmat = Ccusp->mat;
2290     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2291     Ccsr = (CsrMatrix*)Cmat->mat;
2292     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2293     goto finalize;
2294   }
2295   if (!c->nz) goto finalize;
2296   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2297   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2298   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2299   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2300   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2301   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2302   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2303   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2304   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2305   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2306   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2307   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2308   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2309   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2310 
2311   ptype = product->type;
2312   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2313   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2314   switch (ptype) {
2315   case MATPRODUCT_AB:
2316     Amat = Acusp->mat;
2317     Bmat = Bcusp->mat;
2318     break;
2319   case MATPRODUCT_AtB:
2320     Amat = Acusp->matTranspose;
2321     Bmat = Bcusp->mat;
2322     break;
2323   case MATPRODUCT_ABt:
2324     Amat = Acusp->mat;
2325     Bmat = Bcusp->matTranspose;
2326     break;
2327   default:
2328     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2329   }
2330   Cmat = Ccusp->mat;
2331   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2332   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2333   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2334   Acsr = (CsrMatrix*)Amat->mat;
2335   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2336   Ccsr = (CsrMatrix*)Cmat->mat;
2337   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2338   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2339   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2340   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2341 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2342   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2343   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2344   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2345     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2346                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2347                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2348                                mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2349   #else
2350     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2351                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2352                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2353                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2354     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2355                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2356                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2357   #endif
2358 #else
2359   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2360                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2361                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2362                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2363                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2364 #endif
2365   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2366   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2367   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2368   C->offloadmask = PETSC_OFFLOAD_GPU;
2369 finalize:
2370   /* shorter version of MatAssemblyEnd_SeqAIJ */
2371   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
2372   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
2373   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
2374   c->reallocs         = 0;
2375   C->info.mallocs    += 0;
2376   C->info.nz_unneeded = 0;
2377   C->assembled = C->was_assembled = PETSC_TRUE;
2378   C->num_ass++;
2379   PetscFunctionReturn(0);
2380 }
2381 
/*
  Symbolic phase of the sparse-sparse products C = A*B, C = A^t*B and C = A*B^t for
  MATSEQAIJCUSPARSE matrices, performed entirely on the GPU with cuSPARSE SpGEMM.

  The routine
    - validates the operand types and storage format (CSR only),
    - maps AtB/ABt onto AB when symmetry allows, using explicit transposes otherwise,
    - allocates the C mult-struct/CSR containers (honoring compressed-row storage
      inherited from A, and expanding B's compressed rows when needed),
    - runs the version-appropriate cuSPARSE symbolic/numeric SpGEMM path
      (SpGEMMreuse for CUDA >= 11.4, SpGEMM for CUDA 11.x, csrgemm for CUDA 10),
    - copies the resulting row offsets/column indices back to the host to build the
      Mat_SeqAIJ index structure, and installs the numeric-phase callback.

  Collective on C; errors are raised through SETERRQ/CHKERRQ/CHKERRCUSPARSE/CHKERRCUDA.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1; /* cusparseSpMatGetSize() reports sizes as int64_t */
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  /* only SEQAIJCUSPARSE x SEQAIJCUSPARSE is handled here */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data: destroyed together with C->product */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* cuSPARSE SpGEMM only supports non-transposed operands (see opA/opB above), so
     map AtB/ABt onto AB when symmetry allows, else use an explicit transpose */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE; /* C inherits A's row sparsity */
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants, required by CUSPARSE_POINTER_MODE_DEVICE below */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym; /* skip SpGEMM entirely; still build the (empty) host structure */
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CSR view of B with full (uncompressed) row offsets;
       column indices and values are shared with B's compressed representation */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr; /* owned by mmdata, reused by the numeric phase */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* each a(i,brow) multiplies the whole row brow of B: 2*nnz(B(brow,:)) flops */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    /* row i of A contributes an outer-product-like anzi x bnzi block of updates */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* C's descriptor starts empty (nnz 0, NULL arrays); cuSPARSE fills in the sizes,
     and cusparseCsrSetPointers() attaches the real arrays once nnz is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* two-phase (query size, then run) nnz computation for C */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
  cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* copy phase fills C's sparsity pattern; dBuffer5 stays alive for reuse in compute */
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
  /* run the numeric phase once so the symbolic result also carries valid values */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
 }
 #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #endif
#else
  /* CUDA < 11: legacy csrgemm interface; nnz is returned through a host pointer */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* build the host-side Mat_SeqAIJ index structure from the device CSR */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old; /* rows before the next stored row are empty */
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* fill per-row lengths and row statistics from the expanded offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2774 
2775 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2776 
/*
   MatProductSetFromOptions_SeqAIJCUSPARSE - choose the symbolic-product implementation
   for a product whose A is MATSEQAIJCUSPARSE; handles sparse or dense B.

   Dispatch logic:
     - dense B          -> SeqAIJCUSPARSE x SeqDENSECUDA kernels (or the CPU SeqAIJ x SeqDense
                           path when A is bound to the CPU)
     - cusparse B and C -> SeqAIJCUSPARSE x SeqAIJCUSPARSE SpGEMM for AB/AtB/ABt,
                           MatProductSymbolic_ABC_Basic for PtAP/RARt/ABC
     - anything else    -> fall back to the SeqAIJ (CPU) dispatch
   When both B and C are cusparse the user may still force the CPU backend with the
   per-product -mat*_backend_cpu options queried below.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* a matrix bound to the CPU cannot take the GPU path */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABt:
      /* previously missing: ABt is dispatched to the CUSPARSE symbolic below, so offer
         the CPU-backend escape hatch for it too, consistent with the other types */
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatTransposeMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmattransposemult_backend_cpu","Use CPU code","MatMatTransposeMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_abt_backend_cpu","Use CPU code","MatMatTransposeMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* force the fallback branch below */
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no native CUSPARSE triple products; compose from pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2899 
/* yy = A*xx: apply A as-is (no transpose, no conjugation, nothing added) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* yy slot of the kernel is NULL (no vector added); trans=FALSE, herm=FALSE selects op(A)=A */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2908 
/* zz = A*xx + yy: multiply-and-add with op(A)=A */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2917 
/* yy = A^H*xx: conjugate-transpose multiply (trans=TRUE, herm=TRUE), nothing added */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2926 
/* zz = A^H*xx + yy: conjugate-transpose multiply-and-add (trans=TRUE, herm=TRUE) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2935 
/* yy = A^T*xx: transpose multiply (trans=TRUE, herm=FALSE), nothing added */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2944 
/* ScatterAdd - scatter-add a packed vector into a full-length vector: y[idx[i]] += x[i] for i in [0,n).

   Used below to add the compressed (nonzero-rows-only) SpMV result stored in the work
   vector back into the full output vector at the compressed-row positions idx.

   Uses a grid-stride loop with a PetscInt induction variable: the original flat
   "int i = blockIdx.x*blockDim.x + threadIdx.x" index truncates when n exceeds INT_MAX
   (PetscInt may be 64-bit) and ties correctness to the launch configuration. The
   grid-stride form is correct for any grid size.

   NOTE(review): no atomics are used, so idx is assumed to contain distinct entries
   (compressed-row indices are distinct by construction at the call site below). */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt stride = (PetscInt)blockDim.x*gridDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
2950 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.

   Shared implementation behind MatMult/MatMultAdd/MatMultTranspose(Add)/MatMultHermitianTranspose(Add).
   yy may be NULL (nothing added) and yy may alias zz (in-place add). Handles the
   compressed-row storage case (zero rows dropped) via the cusparsestruct work vector:
   for op=A the short SpMV result is scatter-added into the full z; for op=A^T/A^H the
   full x is first gathered into the short work vector. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y as seen by cusparseSpMV; set in both branches below for the CSR format */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* A has no nonzero rows, so op(A) x == 0 and the result is just y (or zero) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* apply the transpose implicitly through the cusparse operation flag */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use (building on demand) an explicitly stored transpose, applied non-transposed */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get up-to-date zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cache of cusparse descriptors/buffers below */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* flop count: 2 flops per stored nonzero; without an added vector the beta=0 write saves one add per nonzero row */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3142 
/* zz = A^T*xx + yy: transpose multiply-and-add (trans=TRUE, herm=FALSE) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3151 
/* Finish assembly: run the host-side SeqAIJ assembly, then free any cached device
   matrix if the nonzero state changed during assembly, since that cached copy no
   longer matches the new structure. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   state  = A->nonzerostate; /* snapshot taken before assembly runs */

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (cusp->deviceMat && state != A->nonzerostate) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3169 
3170 /* --------------------------------------------------------------------------------*/
3171 /*@
3172    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3173    (the default parallel PETSc format). This matrix will ultimately pushed down
3174    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3175    assembly performance the user should preallocate the matrix storage by setting
3176    the parameter nz (or the array nnz).  By setting these parameters accurately,
3177    performance during matrix assembly can be increased by more than a factor of 50.
3178 
3179    Collective
3180 
3181    Input Parameters:
3182 +  comm - MPI communicator, set to PETSC_COMM_SELF
3183 .  m - number of rows
3184 .  n - number of columns
3185 .  nz - number of nonzeros per row (same for all rows)
3186 -  nnz - array containing the number of nonzeros in the various rows
3187          (possibly different for each row) or NULL
3188 
3189    Output Parameter:
3190 .  A - the matrix
3191 
3192    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3193    MatXXXXSetPreallocation() paradgm instead of this routine directly.
3194    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3195 
3196    Notes:
3197    If nnz is given then nz is ignored
3198 
3199    The AIJ format (also called the Yale sparse matrix format or
3200    compressed row storage), is fully compatible with standard Fortran 77
3201    storage.  That is, the stored row and column indices can begin at
3202    either one (as in Fortran) or zero.  See the users' manual for details.
3203 
3204    Specify the preallocated storage with either nz or nnz (not both).
3205    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3206    allocation.  For large problems you MUST preallocate memory or you
3207    will get TERRIBLE performance, see the users' manual chapter on matrices.
3208 
3209    By default, this format uses inodes (identical nodes) when possible, to
3210    improve numerical efficiency of matrix-vector products and solves. We
3211    search for consecutive rows with the same nonzero structure, thereby
3212    reusing matrix information to achieve increased efficiency.
3213 
3214    Level: intermediate
3215 
3216 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3217 @*/
3218 PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
3219 {
3220   PetscErrorCode ierr;
3221 
3222   PetscFunctionBegin;
3223   ierr = MatCreate(comm,A);CHKERRQ(ierr);
3224   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
3225   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3226   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
3227   PetscFunctionReturn(0);
3228 }
3229 
/* Destructor: free the CUSPARSE-specific data hanging off A->spptr (the GPU matrix
   structures for an unfactored matrix, or the triangular-factor structures for a
   factored one), detach every method this type composed on the object (composing
   NULL removes them), then run the base SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3252 
3253 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3254 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by first making a plain SeqAIJ copy, then converting it in place back to
   SEQAIJCUSPARSE so the copy acquires the GPU machinery. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3264 
/* Y = Y + a*X on the GPU. Three paths: identical nonzero patterns -> a single cublas
   axpy over the value arrays; X's pattern a subset of Y's -> cusparse csrgeam with
   the result written in place into Y; otherwise (or if either operand is bound to
   the CPU) -> host MatAXPY_SeqAIJ fallback. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* the two matrices do not share this implementation (e.g. one is bound to the CPU): compute on the host */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* equal nz counts: compare the CSR structure on the device, and upgrade str if identical */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = 1.0*Y + a*X via csrgeam; the output uses Y's descriptor and arrays, i.e. computed in place */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* &a and &b are host addresses, so switch the handle to host pointer mode for this call */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicit workspace buffer, sized by a separate query */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical structure: the matrix sum reduces to a dense axpy over the nz value arrays */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* different patterns: fall back to the host implementation */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3362 
/* Scale every stored entry of Y by a, on the GPU, via a single cublas scal over the
   nonzero value array. Invalidates the cached diagonal afterwards. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij    = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  cublasHandle_t handle;
  cublasStatus_t berr;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&n);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&vals);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(handle,n,&a,vals,inc);CHKERRCUBLAS(berr);
  ierr = PetscLogGpuFlops(n);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&vals);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3384 
/* Zero every stored entry of A: fill the device-side CSR value arrays (of the matrix
   and, if cached, its transpose) and the host value array, then set the offload mask
   to BOTH when the device copy was also zeroed, CPU otherwise. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      gpuzeroed = PETSC_FALSE;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

      if (csr->values) {
        gpuzeroed = PETSC_TRUE; /* device copy now matches the (zeroed) host copy */
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csr = (CsrMatrix*)cusp->matTranspose->mat;

      if (csr->values) thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
    }
  }
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = gpuzeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3415 
/* Switch A's operation table between the CPU (SeqAIJ) and GPU (SeqAIJCUSPARSE)
   implementations. flg=PETSC_TRUE binds to the CPU: the matrix values are first
   copied back from the GPU, the ops are pointed at the SeqAIJ routines, and the
   GPU-only composed methods are removed (composed with NULL). flg=PETSC_FALSE
   restores the CUSPARSE routines and composed methods. Factored matrices are left
   untouched. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* bring the values host-side before handing control to the CPU kernels */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inode use is enabled only while bound to the CPU — presumably inodes only matter for the CPU kernels */
  a->inode.use = flg;
  PetscFunctionReturn(0);
}
3466 
/* MatConvert_SeqAIJ_SeqAIJCUSPARSE - convert a MATSEQAIJ matrix to MATSEQAIJCUSPARSE
   (duplicating, copying into, or re-typing in place according to reuse).

   Switches the default vector type to VECCUDA, allocates the CUSPARSE bookkeeping
   struct hung off B->spptr on first conversion, and installs the CUSPARSE function
   table.  B->offloadmask is left PETSC_OFFLOAD_UNALLOCATED; device-side CSR data is
   created later (presumably by MatSeqAIJCUSPARSECopyToGPU() — see callers). */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* for MAT_INPLACE_MATRIX the caller passes &A as newmat, so B aliases A */
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular (non-factored) matrix: allocate the SpMV/SpMM bookkeeping */
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrix: allocate the triangular-factor bookkeeping instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); /* install the GPU-side mat ops */
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
3527 
/* MatCreate_SeqAIJCUSPARSE - type constructor registered for MATSEQAIJCUSPARSE:
   builds a plain SeqAIJ matrix and then converts it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3537 
3538 /*MC
3539    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3540 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3542    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3543    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3544 
3545    Options Database Keys:
3546 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3547 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3548 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3549 
3550   Level: beginner
3551 
3552 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3553 M*/
3554 
3555 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3556 
/* MatSolverTypeRegister_CUSPARSE - register the CUSPARSE-based solver packages:
   MATSOLVERCUSPARSEBAND provides LU for MATSEQAIJ, while MATSOLVERCUSPARSE
   provides LU/Cholesky/ILU/ICC factorizations for MATSEQAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
3570 
/* MatSeqAIJCUSPARSE_Destroy - free the Mat_SeqAIJCUSPARSE struct and everything it
   owns: the mult structs for the matrix and its transpose, the work vector, the
   cached row offsets, the COO permutation arrays, the csr2csc index array, and the
   cusparse handle.  A NULL *cusparsestruct is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* C++ delete of a NULL pointer is a no-op, so no guards are needed here */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3590 
/* CsrMatrix_Destroy - release a CsrMatrix (values, column indices, row offsets)
   and reset the caller's pointer to NULL; a NULL *mat is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *csr = *mat;

    *mat = 0;                   /* clear the caller's handle up front */
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
  }
  PetscFunctionReturn(0);
}
3603 
/* Overload for triangular-factor structs: frees the matrix descriptor, the
   analysis info, the factor's CSR storage, the device solve buffer, the pinned
   host staging array AA_h, and (CUDA >= 11) the csr2csc conversion buffer,
   then frees the struct itself.  A NULL *trifactor is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} /* AA_h was allocated with cudaMallocHost */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3623 
/* Overload for mult structs: frees the storage-format-specific matrix data
   (CSR, or HYB/ELL on CUDA < 11), the matrix descriptor, the compressed-row
   index array, the device-resident scalar constants, and (CUDA >= 11) the
   generic SpMV descriptors/buffers, then deletes the struct and NULLs the
   caller's pointer.  A NULL *matstruct is a no-op.

   Fix: the return code of CsrMatrix_Destroy() was silently discarded; it is
   now checked with CHKERRQ, matching the triangular-factor overload above. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* previously the error code was ignored */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars used by the mat-vec calls */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) { /* one cached SpMV setup per operation (see cuSpMV usage) */
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3667 
/* MatSeqAIJCUSPARSETriFactors_Reset - free the contents of the triangular-factor
   container (the four factor structs, the permutation index arrays, the work
   vector, and the band-factor device arrays) while keeping the container and
   its cusparse handle alive for refactorization.  A NULL *trifactors is a no-op.

   Fix: a_band_d and i_band_d are now reset to NULL after cudaFree, matching the
   treatment of the other members; previously a second Reset (or Reset followed
   by Destroy) would cudaFree the same stale pointers again. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* lower/upper factors and their transposes */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr); (*trifactors)->a_band_d = NULL;}
    if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr); (*trifactors)->i_band_d = NULL;}
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3690 
/* MatSeqAIJCUSPARSETriFactors_Destroy - fully tear down the triangular-factor
   container: reset its contents, destroy the cusparse handle, and free the
   struct itself.  A NULL *trifactors is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr   = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle;
    if (handle) { stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3707 
/* Strict-weak lexicographic ordering on (i,j) pairs: by row first, then column.
   Used to sort COO entries into CSR order. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    return (r1 < r2) || ((r1 == r2) && (t1.get<1>() < t2.get<1>()));
  }
};
3718 
/* Equality of (i,j) pairs; used to collapse duplicate COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return (t1.get<0>() == t2.get<0>()) && (t1.get<1>() == t2.get<1>());
  }
};
3728 
/* Binary op for adjacent_difference: 1 where consecutive values differ, 0 where equal. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3737 
/* Logical-or of two flags, as a PetscInt (0 or 1). */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
3746 
3747 #include <thrust/iterator/discard_iterator.h>
/* MatSetValuesCOO_SeqAIJCUSPARSE - insert or add the values v[] into the device CSR,
   with v[] ordered as the (coo_i,coo_j) pairs passed to
   MatSetPreallocationCOO_SeqAIJCUSPARSE(); all work happens on the GPU.

   v may be a host or a device pointer.  A NULL v with INSERT_VALUES zeros the matrix.
   cusp->cooPerm maps the sorted-nonzero position back into v[]; cusp->cooPerm_a,
   when present, maps each sorted entry to its unique-nonzero index so repeated
   (i,j) entries can be reduced with reduce_by_key. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;     /* temporary device copy of v when v lives on the host */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation info: fall back to a plain final assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[]; reduce them first, then add to the matrix */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); /* values[] += reduced d_v[] */
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* unique entries: values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device now holds the authoritative values */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3829 
/* MatSeqAIJCUSPARSEInvalidateTranspose - mark the cached device transpose as
   out of date; when destroy is PETSC_TRUE, also free the transpose mult struct
   and the cached csr2csc index array.  A matrix without a CUSPARSE struct is
   left untouched. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3846 
3847 #include <thrust/binary_search.h>
/* MatSetPreallocationCOO_SeqAIJCUSPARSE - build the CSR nonzero pattern from n
   COO (coo_i[],coo_j[]) pairs on the GPU, and record the permutation
   (cusp->cooPerm) plus, when duplicates exist, the duplicate-reduction map
   (cusp->cooPerm_a) used later by MatSetValuesCOO_SeqAIJCUSPARSE().

   The host-side a->i/a->j/a->a arrays are (re)allocated and filled with the
   resulting pattern and zero values, then mirrored to the GPU. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;     /* nzr counts nonempty rows */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* size changed: discard the old permutation arrays and reallocate below */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);             /* device copies of the COO indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);     /* per-row end offsets, computed by upper_bound below */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* replace the host AIJ arrays with ones sized for the deduplicated pattern */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3965 
3966 /*@C
3967     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
3968 
3969    Not collective
3970 
3971     Input Parameters:
3972 +   A - the matrix
3973 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
3974 
3975     Output Parameters:
3976 +   ia - the CSR row pointers
3977 -   ja - the CSR column indices
3978 
3979     Level: developer
3980 
3981     Notes:
3982       When compressed is true, the CSR structure does not contain empty rows
3983 
3984 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
3985 @*/
/* MatSeqAIJCUSPARSEGetIJ - return device pointers to the CSR row offsets (i)
   and column indices (j); see the manual page above for the public contract.

   Fix: the early-return guard used (!i || !j), so a caller requesting only one
   of the two arrays silently got nothing, even though the body below already
   handles i and j independently.  It is now (!i && !j), a backward-compatible
   generalization. */
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i && !j) PetscFunctionReturn(0); /* neither output requested */
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* make sure the device data is current */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* the device CSR is compressed; build (and cache) the full-length row offsets from the host a->i */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4014 
4015 /*@C
4016     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4017 
4018    Not collective
4019 
4020     Input Parameters:
4021 +   A - the matrix
4022 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4023 
4024     Output Parameters:
4025 +   ia - the CSR row pointers
4026 -   ja - the CSR column indices
4027 
4028     Level: developer
4029 
4030 .seealso: MatSeqAIJCUSPARSEGetIJ()
4031 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: no device state to update, just invalidate the borrowed pointers */
  if (j) *j = NULL;
  if (i) *i = NULL;
  PetscFunctionReturn(0);
}
4041 
4042 /*@C
4043    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4044 
4045    Not Collective
4046 
4047    Input Parameter:
4048 .   A - a MATSEQAIJCUSPARSE matrix
4049 
4050    Output Parameter:
4051 .   a - pointer to the device data
4052 
4053    Level: developer
4054 
4055    Notes: may trigger host-device copies if up-to-date matrix data is on host
4056 
4057 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
4058 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); /* only CSR exposes a raw values array */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* may copy host values to the device first */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  /* read-only: offload mask and cached transpose are left untouched */
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
4077 
4078 /*@C
4079    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4080 
4081    Not Collective
4082 
4083    Input Parameter:
4084 .   A - a MATSEQAIJCUSPARSE matrix
4085 
4086    Output Parameter:
4087 .   a - pointer to the device data
4088 
4089    Level: developer
4090 
4091 .seealso: MatSeqAIJCUSPARSEGetArrayRead()
4092 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read access changed nothing, so only the borrowed pointer is invalidated */
  *a = NULL;
  PetscFunctionReturn(0);
}
4102 
4103 /*@C
4104    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4105 
4106    Not Collective
4107 
4108    Input Parameter:
4109 .   A - a MATSEQAIJCUSPARSE matrix
4110 
4111    Output Parameter:
4112 .   a - pointer to the device data
4113 
4114    Level: developer
4115 
4116    Notes: may trigger host-device copies if up-to-date matrix data is on host
4117 
4118 .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
4119 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); /* only CSR exposes a raw values array */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* may copy host values to the device first */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller may write through *a: the device copy becomes authoritative */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); /* cached transpose values may now be stale */
  PetscFunctionReturn(0);
}
4140 /*@C
4141    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4142 
4143    Not Collective
4144 
4145    Input Parameter:
4146 .   A - a MATSEQAIJCUSPARSE matrix
4147 
4148    Output Parameter:
.   a - pointer to the device data (set to NULL on return)
4150 
4151    Level: developer
4152 
4153 .seealso: MatSeqAIJCUSPARSEGetArray()
4154 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* access was read-write: the values may have changed, so bump the object state */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL; /* prevent further use of the borrowed device pointer */
  PetscFunctionReturn(0);
}
4167 
4168 /*@C
4169    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4170 
4171    Not Collective
4172 
4173    Input Parameter:
4174 .   A - a MATSEQAIJCUSPARSE matrix
4175 
4176    Output Parameter:
4177 .   a - pointer to the device data
4178 
4179    Level: developer
4180 
4181    Notes: does not trigger host-device copies and flags data validity on the GPU
4182 
4183 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
4184 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw contiguous values array */
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: unlike MatSeqAIJCUSPARSEGetArray(), no host-to-device copy
     is triggered here since the caller is expected to overwrite the values */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get(); /* raw device pointer into the CSR values array */
  A->offloadmask = PETSC_OFFLOAD_GPU; /* GPU copy becomes the valid one */
  /* any cached explicit transpose would hold stale values after a write */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4204 
4205 /*@C
4206    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
4207 
4208    Not Collective
4209 
4210    Input Parameter:
4211 .   A - a MATSEQAIJCUSPARSE matrix
4212 
4213    Output Parameter:
.   a - pointer to the device data (set to NULL on return)
4215 
4216    Level: developer
4217 
4218 .seealso: MatSeqAIJCUSPARSEGetArrayWrite()
4219 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the values were (presumably) overwritten, so bump the object state */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL; /* prevent further use of the borrowed device pointer */
  PetscFunctionReturn(0);
}
4232 
/* Strict weak ordering on (row, col) for tuples of (row, col, value, flag),
   used by thrust::merge in MatSeqAIJCUSPARSEMergeMats() to interleave the COO
   streams of A and B. The value and flag members do not take part in the
   comparison. operator() is const-qualified so Thrust may invoke the comparator
   through const references/copies. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    if (t1.get<0>() < t2.get<0>()) return true;                  /* earlier row wins */
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); /* tie-break on column */
    return false;
  }
};
4243 
/* Unary functor that adds a fixed integer offset to its argument; used with
   thrust::make_transform_iterator to shift B's column indices (by A->cmap->n)
   and B's transpose row offsets (by a->nz) when concatenating matrices.
   operator() is const-qualified so Thrust may invoke it through const copies. */
struct Shift
{
  int _shift; /* offset added to every input value (may be negative to undo a shift) */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c) const
  {
    return c + _shift;
  }
};
4255 
/* Merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows: C = [A';B']' in MATLAB notation.
   With MAT_INITIAL_MATRIX the CSR structure of C is built on the GPU by converting A and B to COO,
   shifting B's columns by A->cmap->n, merging the two streams in (row,col) order, and converting back
   to CSR; the permutation mapping A/B entries into C (cooPerm) is saved so MAT_REUSE_MATRIX can refill
   only the values. MAT_INPLACE_MATRIX is not supported. Only the CSR storage format is handled. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand CSR row offsets of A and B into per-entry COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* merge the two (row,col,val,flag) streams in (row,col) order; the flag (1 for A, 0 for B)
         recorded in wPerm tells where each C entry came from */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); /* undo the in-place column shift of B */
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split the merged positions: entries coming from A (flag 1) go to cooPerm[0..Annz),
         entries from B (flag 0) to cooPerm[Annz..) */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        /* C^T = [A^T; B^T] stacked by rows: concatenate A^T and B^T CSR data,
           shifting B^T's row offsets by a->nz */
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* overwrite A^T's trailing offset with B^T's first (shifted) one */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the CSR structure of C on the host so the Mat_SeqAIJ side is usable */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      /* scatter A's values into C through cooPerm[0..Annz), then B's through cooPerm[Annz..) */
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4559 
/* Gathers selected entries of the device CSR values array of A into v.
   If idx is given, v[k] = aa[idx[k]] for k in [0,n); otherwise the first n
   entries are copied verbatim. v may be host or device memory (detected via
   isCudaMem()); for host memory the gathered values are staged in a temporary
   device buffer and copied back.

   Bug fix: the final transfer when v is host memory moves data device->host,
   so it must be logged with PetscLogGpuToCpu() (the original code logged it
   as a CPU-to-GPU transfer). */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n); /* upload the gather indices */
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v); /* gather directly into v */
    } else {
      w = new THRUSTARRAY(n); /* stage in device scratch, copy to host below */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[k] = av[widx[k]] via a zip of (permuted source, destination) */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* no index set: plain contiguous copy of the first n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } /* v is on the host: the copy above was device->host */
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4599