xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision abb89eb1843dd8c6f3c2f54b4994d1c95c4713dc)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* Option-name table for the MatCUSPARSEStorageFormat enum, in 0-based value order,
   followed by the enum type name, the option prefix, and a NULL terminator, as
   PetscOptionsEnum() expects */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
  /* Note the SpMM table lists "CSR_ALG1" (value 4) before "COO_ALG4" (value 5) so that
     the string positions match the 0-based cuSPARSE enum values above */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
68 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
69 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
70 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
71 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
72 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
73 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
74 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
75 
76 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
77 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
79 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
82 
83 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
84 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
85 
86 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
87 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
88 
89 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
90 {
91   cusparseStatus_t   stat;
92   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
93 
94   PetscFunctionBegin;
95   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
96   cusparsestruct->stream = stream;
97   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
98   PetscFunctionReturn(0);
99 }
100 
101 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
102 {
103   cusparseStatus_t   stat;
104   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
105 
106   PetscFunctionBegin;
107   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
108   if (cusparsestruct->handle != handle) {
109     if (cusparsestruct->handle) {
110       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
111     }
112     cusparsestruct->handle = handle;
113   }
114   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
115   PetscFunctionReturn(0);
116 }
117 
118 PetscErrorCode MatCUSPARSEClearHandle(Mat A)
119 {
120   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
121   PetscBool          flg;
122   PetscErrorCode     ierr;
123 
124   PetscFunctionBegin;
125   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
126   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
127   if (cusparsestruct->handle) cusparsestruct->handle = 0;
128   PetscFunctionReturn(0);
129 }
130 
/* Composed on factor matrices as "MatFactorGetSolverType_C": reports that the
   factorization came from the MATSOLVERCUSPARSE package. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
137 
138 /*MC
139   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
140   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
141 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
142   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
143   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
144   algorithms are not recommended. This class does NOT support direct solver operations.
145 
146   Level: beginner
147 
148 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
149 M*/
150 
/*
  Factor-matrix constructor registered for MATSOLVERCUSPARSE on seqaijcusparse
  matrices: creates the (square, n x n) factor matrix *B, records the requested
  factor type, and installs the symbolic-factorization callbacks for that type.
  Supported types: LU/ILU/ILUDT and Cholesky/ICC; anything else is an error.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  /* factortype is assigned before MatSetType() -- NOTE(review): the ordering looks
     deliberate (the type constructor can then see it is building a factor matrix);
     confirm before reordering */
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    /* NOTE(review): block sizes are propagated only in the LU/ILU branch above;
       verify whether this Cholesky/ICC path also needs MatSetBlockSizesFromMats() */
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* the factor allocates its own storage during the symbolic phase */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
176 
177 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
178 {
179   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
180 
181   PetscFunctionBegin;
182   switch (op) {
183   case MAT_CUSPARSE_MULT:
184     cusparsestruct->format = format;
185     break;
186   case MAT_CUSPARSE_ALL:
187     cusparsestruct->format = format;
188     break;
189   default:
190     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
191   }
192   PetscFunctionReturn(0);
193 }
194 
195 /*@
196    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
197    operation. Only the MatMult operation can use different GPU storage formats
198    for MPIAIJCUSPARSE matrices.
199    Not Collective
200 
201    Input Parameters:
202 +  A - Matrix of type SEQAIJCUSPARSE
203 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
204 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
205 
206    Output Parameter:
207 
208    Level: intermediate
209 
210 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
211 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the implementation composed on A (PetscTryMethod is a no-op for
     matrix types that do not provide "MatCUSPARSESetFormat_C") */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
221 
222 /*@
223    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
224 
225    Collective on mat
226 
227    Input Parameters:
228 +  A - Matrix of type SEQAIJCUSPARSE
229 -  transgen - the boolean flag
230 
231    Level: intermediate
232 
233 .seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
234 @*/
235 PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
236 {
237   PetscErrorCode ierr;
238   PetscBool      flg;
239 
240   PetscFunctionBegin;
241   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
242   ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
243   if (flg) {
244     Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
245 
246     if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
247     cusp->transgen = transgen;
248     if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
249       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
250     }
251   }
252   PetscFunctionReturn(0);
253 }
254 
/*
  Parse the -mat_cusparse_* command-line options for a seqaijcusparse matrix.
  Options are only consulted for non-factored matrices. Note that
  -mat_cusparse_storage_format (MAT_CUSPARSE_ALL) is parsed after
  -mat_cusparse_mult_storage_format (MAT_CUSPARSE_MULT), so when both are given
  the ALL setting wins.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    /* same positional-consistency guard as for SpMV above */
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    /* ALG1 is value 1 in cusparseCsr2CscAlg_t; the "INVALID" placeholder in the table occupies slot 0 */
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
298 
299 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
300 {
301   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
302   PetscErrorCode               ierr;
303 
304   PetscFunctionBegin;
305   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
306   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
307   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
308   PetscFunctionReturn(0);
309 }
310 
311 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
312 {
313   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
314   PetscErrorCode               ierr;
315 
316   PetscFunctionBegin;
317   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
318   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
319   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
320   PetscFunctionReturn(0);
321 }
322 
323 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
324 {
325   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
326   PetscErrorCode               ierr;
327 
328   PetscFunctionBegin;
329   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
330   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
331   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
332   PetscFunctionReturn(0);
333 }
334 
335 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
336 {
337   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
338   PetscErrorCode               ierr;
339 
340   PetscFunctionBegin;
341   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
342   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
343   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
344   PetscFunctionReturn(0);
345 }
346 
/*
  Build (or refresh) the unit-diagonal lower-triangular factor L on the GPU from the
  host-side factored SeqAIJ data. On the first call the CSR arrays (AiLo,AjLo,AALo)
  are assembled in pinned host memory, uploaded into thrust device arrays, and the
  cuSPARSE triangular-solve analysis is performed; subsequent calls refresh only the
  numerical values. Only runs when the host copy is fresher than the device copy
  (offloadmask UNALLOCATED or CPU). The loop starting at i=1 shows that row 0 of L
  carries only its unit diagonal; the unit diagonal entries are inserted explicitly
  for every row.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal:
         n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1 (row 0 contributes none) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host buffers so the later thrust assign()s stream efficiently */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row 0 holds only its unit diagonal */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append its unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: device-side CSR copies of the host arrays built above */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2-style API needs an explicit work buffer sized by the library */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer; AALo is kept alive in AA_h for later value-only updates,
           while the index buffers are freed immediately */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only: sparsity pattern is unchanged, so rebuild AA_h and re-upload */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      /* NOTE(review): thrust/cuSPARSE wrappers typically throw std::exception-derived
         types, so this catch(char*) may never fire; confirm the intended exception type */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
491 
/*
  Build (or refresh) the upper-triangular factor U on the GPU from the host-side
  factored SeqAIJ data. Mirrors MatSeqAIJCUSPARSEBuildILULowerTriMatrix(): the first
  call assembles the CSR arrays (AiUp,AjUp,AAUp) in pinned host memory, uploads them,
  and performs the cuSPARSE triangular-solve analysis; later calls refresh values
  only. U is read out of the a->diag index array, traversed from the last row
  backwards; the stored diagonal value is inverted here (1./v[nz]) so the GPU solve
  uses a non-unit diagonal directly.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix;
         adiag runs downward in this factored layout, so the count is adiag[0]-adiag[n] */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host buffers so the later thrust assign()s stream efficiently */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, walking rows from n-1 down to 0 */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements: the stored diagonal v[nz] is inverted here */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: device-side CSR copies of the host arrays built above */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2-style API needs an explicit work buffer sized by the library */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer; AAUp is kept alive in AA_h for later value-only updates,
           while the index buffers are freed immediately */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only: sparsity pattern is unchanged, so rebuild AA_h and re-upload */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      /* NOTE(review): thrust/cuSPARSE wrappers typically throw std::exception-derived
         types, so this catch(char*) may never fire; confirm the intended exception type */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
635 
/*
  Upload the ILU(0) triangular factors of A to the GPU and record the
  row/column permutations (if any) needed by the permuted triangular solves.
  Factors are (re)built by the Build{Lower,Upper}TriMatrix helpers; the
  permutation index arrays are uploaded once and then reused.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowperm  = aij->row;
  IS                           colperm  = aij->icol;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscInt                     nrows    = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* (re)build both triangular factors on the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used between the lower and upper solves */
  if (!factors->workVector) { factors->workVector = new THRUSTARRAY(nrows); }
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation (applied in the lower triangular solve): upload only once, and only if non-trivial */
  ierr = ISIdentity(rowperm,&rowIsIdentity);CHKERRQ(ierr);
  if (!rowIsIdentity && !factors->rpermIndices) {
    const PetscInt *ridx;

    ierr = ISGetIndices(rowperm,&ridx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(ridx, ridx+nrows);
    ierr = ISRestoreIndices(rowperm,&ridx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation (applied in the upper triangular solve): same treatment */
  ierr = ISIdentity(colperm,&colIsIdentity);CHKERRQ(ierr);
  if (!colIsIdentity && !factors->cpermIndices) {
    const PetscInt *cidx;

    ierr = ISGetIndices(colperm,&cidx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(cidx, cidx+nrows);
    ierr = ISRestoreIndices(colperm,&cidx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
679 
/*
  Build the ICC(0) triangular factor data structures for A on the GPU.

  A single CSR structure (AiUp/AjUp/AAUp) stores the upper triangular factor;
  a second value array (AALo), sharing the same sparsity pattern, stores the
  values used for the lower triangular solve, which is performed as a
  TRANSPOSE solve on the upper pattern (see loTriFactor->solveOp below).

  First call (no factor structs yet): allocate everything, upload pattern and
  values, and run the cusparse solve analysis for both factors.
  Subsequent calls: refresh only the numerical values and re-upload them.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is a Mat_SeqAIJ here; this Mat_SeqSBAIJ cast relies on
     the i/j/a fields being layout-compatible between the two structs — confirm */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          /* v[nz] is the diagonal entry of row i; both factors store its reciprocal */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* negate the off-diagonal upper values; the lower values are the
               upper values further scaled by the reciprocal diagonal */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* the lower factor reuses the UPPER pattern: the solve below is a TRANSPOSE solve */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* values-only refresh: the sparsity pattern on the GPU is unchanged */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
895 
/*
  Upload the ICC(0) triangular factors of A to the GPU and record the
  row permutation (and its inverse, used as the column permutation) needed
  by the permuted triangular solves.

  Fix: guard the permutation upload with !rpermIndices, matching
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU.  Without the guard, every
  repeated numeric factorization with a non-identity permutation leaked the
  previously allocated rpermIndices/cpermIndices arrays (both are set
  together, so checking rpermIndices suffices).
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch vector used between the lower and upper solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* both factors share one pattern: off-diagonal entries counted twice plus the diagonal */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity && !cusparseTriFactors->rpermIndices) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
933 
/*
  Numeric LU factorization for SEQAIJCUSPARSE: factor on the host with the
  SeqAIJ kernel, choose the solve variants based on the row/column orderings,
  then upload the triangular factors and run the cusparse solve analysis.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bseq = (Mat_SeqAIJ*)B->data;
  PetscBool      rowIsIdentity,colIsIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization runs on the host, so A must be current there */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* natural ordering on both sides admits the permutation-free solve kernels */
  ierr = ISIdentity(bseq->row,&rowIsIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(bseq->col,&colIsIdentity);CHKERRQ(ierr);
  if (rowIsIdentity && colIsIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
964 
/*
  Numeric Cholesky factorization for SEQAIJCUSPARSE: factor on the host with
  the SeqAIJ kernel, choose the solve variants based on the permutation,
  then upload the triangular factors and run the cusparse solve analysis.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bseq = (Mat_SeqAIJ*)B->data;
  PetscBool      permIsIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization runs on the host, so A must be current there */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* a natural ordering admits the permutation-free solve kernels */
  ierr = ISIdentity(bseq->row,&permIsIdentity);CHKERRQ(ierr);
  B->ops->solve          = permIsIdentity ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = permIsIdentity ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
994 
/*
  Build the transposes (CSC form) of both triangular factors on the GPU and
  run the cusparse solve analysis on them, so that MatSolveTranspose can use
  NON_TRANSPOSE solves on the transposed structures.

  Fix: the MAT_CUSPARSEGenerateTranspose log event was "ended" with a second
  PetscLogEventBegin in two places (after each csr2csc), leaving the event
  permanently open and corrupting the log-event nesting; both are now
  PetscLogEventEnd.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor;
     transposition flips the fill mode, the rest is copied over */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was PetscLogEventBegin: event never ended */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was PetscLogEventBegin: event never ended */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1196 
/*
   MatSeqAIJCUSPARSEGenerateTransposeForMult - builds cusparsestruct->matTranspose, a
   device-resident copy of A^T in the currently selected storage format, so the
   transpose-multiply routines can use it directly instead of transposing on the fly.
   Returns immediately when transpose generation is disabled (transgen), when the
   transpose has already been built, or when either dimension of A is zero.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix; the transpose inherits the index base of the original matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: these scalars are kept in device memory because the handle is
     switched to CUSPARSE_POINTER_MODE_DEVICE below */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* A^T swaps the row and column dimensions; the nonzero count is unchanged */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* stage the host row offsets a->i (PetscInt) in a 32-bit device array, the index
       width csr2csc expects here */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires a caller-allocated work buffer; it is stored in cusparsestruct
       so MatSeqAIJCUSPARSECopyToGPU() can reuse it when only the values change */
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                  A->cmap->n, matrix->num_entries,
                                  matrix->values->data().get(),
                                  cusparsestruct->rowoffsets_gpu->data().get(),
                                  matrix->column_indices->data().get(),
                                  matrixT->values->data().get(),
                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                  cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
   #endif

    /* note the output argument order differs between the CUDA >= 11 (Csr2cscEx2) and
       older csr2csc interfaces */
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
);CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* the generic SpMV API of CUDA >= 11 also needs a cusparseSpMatDescr_t for A^T */
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
   #endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
    /* cuSPARSE has no direct HYB/ELL transpose, so go HYB -> CSR -> CSC (== CSR of A^T) -> HYB */
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    /* NOTE(review): tempT keeps A's dimensions instead of the transposed ones; this
       appears to assume a square matrix -- confirm for rectangular A */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries: the CSR staging matrices are no longer needed once the HYB
       transpose has been built */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
   #endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1355 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   MatSolveTranspose_SeqAIJCUSPARSE - solves A^T x = b on the GPU with explicitly
   transposed copies of the triangular factors, built lazily on first use.  Because
   transposition reverses a product ((LU)^T = U^T L^T), the upper factor is applied
   before the lower one.  The factorization's row/column permutations are applied
   with thrust permutation-iterator (gather) copies.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, gather b through the row permutation into x, the input of the U^T solve */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor: x -> tempGPU.  The CUDA >= 9
     (csrsv2) interface additionally takes nnz and a solve policy/buffer. */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve with the transposed lower factor: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1442 
/*
   MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - like
   MatSolveTranspose_SeqAIJCUSPARSE but for a factorization in natural ordering:
   no row/column permutations are applied, so b feeds the U^T solve directly and
   the L^T solve writes straight into x.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with the transposed upper factor: b -> tempGPU (U^T comes first
     because (LU)^T = U^T L^T) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve with the transposed lower factor: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1510 
/*
   MatSolve_SeqAIJCUSPARSE - solves A x = b on the GPU with the stored triangular
   factors: gather b through the row permutation into the work vector, forward-solve
   with L, back-solve with U, then gather through the column permutation into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: tempGPU <- b[rperm] (the copy length is
     determined by the rpermIndices iterator pair) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: tempGPU -> x.  The CUDA >= 9 (csrsv2) interface additionally
     takes nnz and a solve policy/buffer. */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: x <- tempGPU[cperm] */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1585 
/*
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering - MatSolve_SeqAIJCUSPARSE without the
   permutation steps: b feeds the L solve directly and the U solve writes into x,
   with the shared work vector holding the intermediate result.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: b -> tempGPU.  The CUDA >= 9 (csrsv2) interface additionally
     takes nnz and a solve policy/buffer. */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Next, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1645 
/*
   MatSeqAIJCUSPARSECopyFromGPU - brings the matrix values back to the host when the
   GPU holds the only up-to-date copy; afterwards both copies agree (PETSC_OFFLOAD_BOTH).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    /* only the numerical values move; the index arrays are not copied */
    cerr = cudaMemcpy(a->a, csr->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1666 
/*
   MatSeqAIJGetArray_SeqAIJCUSPARSE - hands out the host value array, first syncing it
   from the GPU if needed.  The caller may modify the array, so the device copy is
   marked stale (PETSC_OFFLOAD_CPU).
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure the host copy is current before exposing it */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = ((Mat_SeqAIJ*)A->data)->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1678 
1679 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1680 {
1681   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1682   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1683   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1684   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1685   PetscErrorCode               ierr;
1686   cusparseStatus_t             stat;
1687   PetscBool                    both = PETSC_TRUE;
1688   cudaError_t                  err;
1689 
1690   PetscFunctionBegin;
1691   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
1692   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1693     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1694       /* Copy values only */
1695       CsrMatrix *matrix,*matrixT;
1696       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1697 
1698       if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
1699       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1700       matrix->values->assign(a->a, a->a+a->nz);
1701       err  = WaitForCUDA();CHKERRCUDA(err);
1702       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1703       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1704 
1705       /* Update matT when it was built before */
1706       if (cusparsestruct->matTranspose) {
1707         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1708         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1709         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1710         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1711                             A->cmap->n, matrix->num_entries,
1712                             matrix->values->data().get(),
1713                             cusparsestruct->rowoffsets_gpu->data().get(),
1714                             matrix->column_indices->data().get(),
1715                             matrixT->values->data().get(),
1716                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1717                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1718                             CUSPARSE_ACTION_NUMERIC,indexBase,
1719                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1720                           #else
1721                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1722                             CUSPARSE_ACTION_NUMERIC, indexBase
1723                           #endif
1724 );CHKERRCUSPARSE(stat);
1725         err  = WaitForCUDA();CHKERRCUDA(err);
1726         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1727       }
1728     } else {
1729       PetscInt nnz;
1730       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1731       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1732       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
1733       delete cusparsestruct->workVector;
1734       delete cusparsestruct->rowoffsets_gpu;
1735       try {
1736         if (a->compressedrow.use) {
1737           m    = a->compressedrow.nrows;
1738           ii   = a->compressedrow.i;
1739           ridx = a->compressedrow.rindex;
1740         } else {
1741           m    = A->rmap->n;
1742           ii   = a->i;
1743           ridx = NULL;
1744         }
1745         if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
1746         if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
1747         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1748         else nnz = a->nz;
1749 
1750         /* create cusparse matrix */
1751         cusparsestruct->nrows = m;
1752         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1753         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1754         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1755         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1756 
1757         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1758         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1759         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1760         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1761         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1762         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1763         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1764 
1765         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1766         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1767           /* set the matrix */
1768           CsrMatrix *mat= new CsrMatrix;
1769           mat->num_rows = m;
1770           mat->num_cols = A->cmap->n;
1771           mat->num_entries = nnz;
1772           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1773           mat->row_offsets->assign(ii, ii + m+1);
1774 
1775           mat->column_indices = new THRUSTINTARRAY32(nnz);
1776           mat->column_indices->assign(a->j, a->j+nnz);
1777 
1778           mat->values = new THRUSTARRAY(nnz);
1779           if (a->a) mat->values->assign(a->a, a->a+nnz);
1780 
1781           /* assign the pointer */
1782           matstruct->mat = mat;
1783          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1784           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1785             stat = cusparseCreateCsr(&matstruct->matDescr,
1786                                     mat->num_rows, mat->num_cols, mat->num_entries,
1787                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1788                                     mat->values->data().get(),
1789                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1790                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1791           }
1792          #endif
1793         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1794          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1795           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1796          #else
1797           CsrMatrix *mat= new CsrMatrix;
1798           mat->num_rows = m;
1799           mat->num_cols = A->cmap->n;
1800           mat->num_entries = nnz;
1801           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1802           mat->row_offsets->assign(ii, ii + m+1);
1803 
1804           mat->column_indices = new THRUSTINTARRAY32(nnz);
1805           mat->column_indices->assign(a->j, a->j+nnz);
1806 
1807           mat->values = new THRUSTARRAY(nnz);
1808           if (a->a) mat->values->assign(a->a, a->a+nnz);
1809 
1810           cusparseHybMat_t hybMat;
1811           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1812           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1813             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1814           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1815               matstruct->descr, mat->values->data().get(),
1816               mat->row_offsets->data().get(),
1817               mat->column_indices->data().get(),
1818               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1819           /* assign the pointer */
1820           matstruct->mat = hybMat;
1821 
1822           if (mat) {
1823             if (mat->values) delete (THRUSTARRAY*)mat->values;
1824             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1825             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1826             delete (CsrMatrix*)mat;
1827           }
1828          #endif
1829         }
1830 
1831         /* assign the compressed row indices */
1832         if (a->compressedrow.use) {
1833           cusparsestruct->workVector = new THRUSTARRAY(m);
1834           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1835           matstruct->cprowIndices->assign(ridx,ridx+m);
1836           tmp = m;
1837         } else {
1838           cusparsestruct->workVector = NULL;
1839           matstruct->cprowIndices    = NULL;
1840           tmp = 0;
1841         }
1842         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1843 
1844         /* assign the pointer */
1845         cusparsestruct->mat = matstruct;
1846       } catch(char *ex) {
1847         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1848       }
1849       err  = WaitForCUDA();CHKERRCUDA(err);
1850       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1851       cusparsestruct->nonzerostate = A->nonzerostate;
1852     }
1853     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1854   }
1855   PetscFunctionReturn(0);
1856 }
1857 
/* Thrust functor: accumulate tuple element 0 into tuple element 1 (dst += src),
   applied elementwise over zipped iterator ranges. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) { thrust::get<1>(t) += thrust::get<0>(t); }
};
1867 
/* Thrust functor: copy tuple element 0 into tuple element 1 (dst = src),
   applied elementwise over zipped iterator ranges. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) { thrust::get<1>(t) = thrust::get<0>(t); }
};
1877 
/* Thrust functor: copy tuple element 1 into tuple element 0 (the reverse
   direction of VecCUDAEquals), applied elementwise over zipped iterator ranges. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) { thrust::get<0>(t) = thrust::get<1>(t); }
};
1887 
/* Cached state for the sparse-dense and sparse-sparse MatMat products below;
   stored in C->product->data and released by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool             cisdense;      /* caller's C was MATSEQDENSE (CPU); numeric phase converts the result back */
  PetscScalar           *Bt;           /* device buffer for an explicit B^T (pre-CUDA-11 csrmm cannot transpose B) */
  Mat                   X;             /* intermediate dense product used by the RARt/PtAP paths */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;         /* flop count logged by the sparse-sparse numeric phase */
  CsrMatrix             *Bcsr;         /* replacement CSR used when B is in compressed row storage */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;   /* sparse descriptor matching Bcsr */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;     /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;     /* dense descriptor for C (or X) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t                mmBufferSize;  /* size of mmBuffer in bytes */
  void                  *mmBuffer;     /* workspace for cusparseSpMM / cusparseSpGEMM_compute */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;    /* descriptor for the SpGEMM computation */
#endif
};
1907 
/* Destructor for the MatMatCusparse product data: frees device buffers, cusparse
   descriptors and the intermediate matrix, then the host struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  /* cudaFree(NULL) is a no-op, so Bt needs no guard */
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr);
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* descriptors and workspaces are created lazily, so each one is guarded */
  if (mm->matSpBDescr) {
    stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->mmBuffer) {
    cerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cerr);
  }
  if (mm->mmBuffer2) {
    cerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cerr);
  }
  if (mm->matBDescr) {
    stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->matCDescr) {
    stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->spgemmDesc) {
    stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat);
  }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1932 
1933 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1934 
/*
  Numeric phase of a product with A sparse (SEQAIJCUSPARSE) and B dense (SEQDENSECUDA).
  For AB/AtB/ABt the sparse-dense product is written directly into C; for RARt/PtAP it
  is written into the intermediate mmdata->X and then combined with B by a dense-dense
  multiply. The symbolic phase has stashed a MatMatCusparse in C->product->data.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select op(A) and the dimensions m x n of op(A)*op(B) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* use the explicitly stored transpose when transpose generation is enabled,
       otherwise let cusparse apply the transpose operation */
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* RARt/PtAP write the sparse-dense product into the intermediate X; the others write into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }
    /* NOTE(review): if this branch is re-entered only because Clda changed, a surviving
       matBDescr is not re-pointed at the current barray here (only the else-branch below
       refreshes it) — presumably B's device pointer is stable across calls; verify */

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the workspace only when needed; cudaFree(NULL) is harmless on the first pass */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr; /* shadows the outer cudaError_t cerr within this scope */

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    /* explicitly transpose B into the Bt buffer allocated by the symbolic phase */
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* RARt/PtAP: combine the intermediate X with B via a dense-dense multiply
     (the transpose flags differ between the two cases) */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore the matrix types the caller originally handed in */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2108 
/*
  Symbolic phase for products of a SEQAIJCUSPARSE matrix A with a dense matrix B:
  sizes C, forces it onto the GPU as SEQDENSECUDA, and allocates the MatMatCusparse
  product data (plus the Bt staging buffer and/or intermediate X where needed).
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mm;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions for each supported product type */
  if (product->type == MATPRODUCT_AB)        { m = A->rmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_AtB)  { m = A->cmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_ABt)  { m = A->rmap->n; n = B->rmap->n; }
  else if (product->type == MATPRODUCT_PtAP) { m = B->cmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_RARt) { m = B->rmap->n; n = B->rmap->n; }
  else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mm);CHKERRQ(ierr);
  mm->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mm->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mm->X);CHKERRQ(ierr);
    ierr = MatSetType(mm->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    if (product->type == MATPRODUCT_RARt) {
      ierr = MatSetSizes(mm->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mm->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2182 
/*
  Numeric phase of C = op(A)*op(B) with A, B and C all SEQAIJCUSPARSE (sparse-sparse
  product). When mmdata->reusesym is set, the values were already computed during the
  symbolic phase and only the assembly bookkeeping at 'finalize' is performed.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize;
  }
  /* an empty product still needs the assembly bookkeeping below */
  if (!c->nz) goto finalize;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* a symmetric operand lets us replace the transposed product with a plain one */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  /* select the operand mult structs; the transposes are expected to exist already
     (generated during the symbolic phase) */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* recompute the numeric values (descriptor and workspace come from mmdata),
     then copy them into C's CSR descriptor */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2297 
2298 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2299 {
2300   Mat_Product                  *product = C->product;
2301   Mat                          A,B;
2302   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2303   Mat_SeqAIJ                   *a,*b,*c;
2304   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2305   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2306   PetscInt                     i,j,m,n,k;
2307   PetscBool                    flg;
2308   PetscErrorCode               ierr;
2309   cusparseStatus_t             stat;
2310   cudaError_t                  cerr;
2311   MatProductType               ptype;
2312   MatMatCusparse               *mmdata;
2313   PetscLogDouble               flops;
2314   PetscBool                    biscompressed,ciscompressed;
2315 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2316   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2317   size_t                       bufSize2;
2318   cusparseSpMatDescr_t         BmatSpDescr;
2319 #else
2320   int                          cnz;
2321 #endif
2322 
2323   PetscFunctionBegin;
2324   MatCheckProduct(C,1);
2325   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2326   A    = product->A;
2327   B    = product->B;
2328   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2329   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2330   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2331   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2332   a = (Mat_SeqAIJ*)A->data;
2333   b = (Mat_SeqAIJ*)B->data;
2334   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2335   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2336   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2337   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2338 
2339   /* product data */
2340   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2341   C->product->data    = mmdata;
2342   C->product->destroy = MatDestroy_MatMatCusparse;
2343 
2344   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2345   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2346   ptype = product->type;
2347   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2348   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2349   biscompressed = PETSC_FALSE;
2350   ciscompressed = PETSC_FALSE;
2351   switch (ptype) {
2352   case MATPRODUCT_AB:
2353     m = A->rmap->n;
2354     n = B->cmap->n;
2355     k = A->cmap->n;
2356     Amat = Acusp->mat;
2357     Bmat = Bcusp->mat;
2358     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2359     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2360     break;
2361   case MATPRODUCT_AtB:
2362     m = A->cmap->n;
2363     n = B->cmap->n;
2364     k = A->rmap->n;
2365     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
2366     Amat = Acusp->matTranspose;
2367     Bmat = Bcusp->mat;
2368     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2369     break;
2370   case MATPRODUCT_ABt:
2371     m = A->rmap->n;
2372     n = B->rmap->n;
2373     k = A->cmap->n;
2374     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
2375     Amat = Acusp->mat;
2376     Bmat = Bcusp->matTranspose;
2377     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2378     break;
2379   default:
2380     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2381   }
2382 
2383   /* create cusparse matrix */
2384   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2385   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2386   c     = (Mat_SeqAIJ*)C->data;
2387   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2388   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2389   Ccsr  = new CsrMatrix;
2390 
2391   c->compressedrow.use = ciscompressed;
2392   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2393     c->compressedrow.nrows = a->compressedrow.nrows;
2394     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2395     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2396     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2397     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2398     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2399   } else {
2400     c->compressedrow.nrows  = 0;
2401     c->compressedrow.i      = NULL;
2402     c->compressedrow.rindex = NULL;
2403     Ccusp->workVector       = NULL;
2404     Cmat->cprowIndices      = NULL;
2405   }
2406   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2407   Ccusp->mat      = Cmat;
2408   Ccusp->mat->mat = Ccsr;
2409   Ccsr->num_rows    = Ccusp->nrows;
2410   Ccsr->num_cols    = n;
2411   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2412   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2413   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2414   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2415   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2416   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2417   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2418   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2419   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2420   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2421   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2422     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2423     c->nz = 0;
2424     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2425     Ccsr->values = new THRUSTARRAY(c->nz);
2426     goto finalizesym;
2427   }
2428 
2429   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2430   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2431   Acsr = (CsrMatrix*)Amat->mat;
2432   if (!biscompressed) {
2433     Bcsr = (CsrMatrix*)Bmat->mat;
2434 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2435     BmatSpDescr = Bmat->matDescr;
2436 #endif
2437   } else { /* we need to use row offsets for the full matrix */
2438     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2439     Bcsr = new CsrMatrix;
2440     Bcsr->num_rows       = B->rmap->n;
2441     Bcsr->num_cols       = cBcsr->num_cols;
2442     Bcsr->num_entries    = cBcsr->num_entries;
2443     Bcsr->column_indices = cBcsr->column_indices;
2444     Bcsr->values         = cBcsr->values;
2445     if (!Bcusp->rowoffsets_gpu) {
2446       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2447       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2448       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2449     }
2450     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2451     mmdata->Bcsr = Bcsr;
2452 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2453     if (Bcsr->num_rows && Bcsr->num_cols) {
2454       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2455                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2456                                Bcsr->values->data().get(),
2457                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2458                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2459     }
2460     BmatSpDescr = mmdata->matSpBDescr;
2461 #endif
2462   }
2463   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2464   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2465   /* precompute flops count */
2466   if (ptype == MATPRODUCT_AB) {
2467     for (i=0, flops = 0; i<A->rmap->n; i++) {
2468       const PetscInt st = a->i[i];
2469       const PetscInt en = a->i[i+1];
2470       for (j=st; j<en; j++) {
2471         const PetscInt brow = a->j[j];
2472         flops += 2.*(b->i[brow+1] - b->i[brow]);
2473       }
2474     }
2475   } else if (ptype == MATPRODUCT_AtB) {
2476     for (i=0, flops = 0; i<A->rmap->n; i++) {
2477       const PetscInt anzi = a->i[i+1] - a->i[i];
2478       const PetscInt bnzi = b->i[i+1] - b->i[i];
2479       flops += (2.*anzi)*bnzi;
2480     }
2481   } else { /* TODO */
2482     flops = 0.;
2483   }
2484 
2485   mmdata->flops = flops;
2486   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2487 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2488   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2489   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2490                            NULL, NULL, NULL,
2491                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2492                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2493   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2494   /* ask bufferSize bytes for external memory */
2495   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2496                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2497                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2498                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2499   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2500   /* inspect the matrices A and B to understand the memory requirement for the next step */
2501   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2502                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2503                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2504                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2505   /* ask bufferSize again bytes for external memory */
2506   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2507                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2508                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2509                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2510   /* The CUSPARSE documentation is not clear, nor the API
2511      We need both buffers to perform the operations properly!
2512      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2513      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2514      is stored in the descriptor! What a messy API... */
2515   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2516   /* compute the intermediate product of A * B */
2517   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2518                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2519                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2520                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2521   /* get matrix C non-zero entries C_nnz1 */
2522   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2523   c->nz = (PetscInt) C_nnz1;
2524   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2525   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2526   Ccsr->values = new THRUSTARRAY(c->nz);
2527   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2528   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2529                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2530   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2531                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2532                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2533 #else
2534   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2535   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2536                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2537                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2538                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2539                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2540   c->nz = cnz;
2541   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2542   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2543   Ccsr->values = new THRUSTARRAY(c->nz);
2544   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2545 
2546   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2547   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2548      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2549      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2550   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2551                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2552                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2553                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2554                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2555 #endif
2556   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2557   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2558   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2559 finalizesym:
2560   c->singlemalloc = PETSC_FALSE;
2561   c->free_a       = PETSC_TRUE;
2562   c->free_ij      = PETSC_TRUE;
2563   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2564   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2565   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2566     PetscInt *d_i = c->i;
2567     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2568     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2569     ii   = *Ccsr->row_offsets;
2570     jj   = *Ccsr->column_indices;
2571     if (ciscompressed) d_i = c->compressedrow.i;
2572     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2573     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2574   } else {
2575     PetscInt *d_i = c->i;
2576     if (ciscompressed) d_i = c->compressedrow.i;
2577     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2578     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2579   }
2580   if (ciscompressed) { /* need to expand host row offsets */
2581     PetscInt r = 0;
2582     c->i[0] = 0;
2583     for (k = 0; k < c->compressedrow.nrows; k++) {
2584       const PetscInt next = c->compressedrow.rindex[k];
2585       const PetscInt old = c->compressedrow.i[k];
2586       for (; r < next; r++) c->i[r+1] = old;
2587     }
2588     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2589   }
2590   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2591   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2592   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2593   c->maxnz = c->nz;
2594   c->nonzerorowcnt = 0;
2595   c->rmax = 0;
2596   for (k = 0; k < m; k++) {
2597     const PetscInt nn = c->i[k+1] - c->i[k];
2598     c->ilen[k] = c->imax[k] = nn;
2599     c->nonzerorowcnt += (PetscInt)!!nn;
2600     c->rmax = PetscMax(c->rmax,nn);
2601   }
2602   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2603   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2604   Ccsr->num_entries = c->nz;
2605 
2606   C->nonzerostate++;
2607   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2608   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2609   Ccusp->nonzerostate = C->nonzerostate;
2610   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2611   C->preallocated  = PETSC_TRUE;
2612   C->assembled     = PETSC_FALSE;
2613   C->was_assembled = PETSC_FALSE;
2614   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2615     mmdata->reusesym = PETSC_TRUE;
2616     C->offloadmask   = PETSC_OFFLOAD_GPU;
2617   }
2618   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2619   PetscFunctionReturn(0);
2620 }
2621 
2622 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2623 
/* Choose the product symbolic implementation for a SeqAIJCUSPARSE A;
   B may be sparse (SeqAIJCUSPARSE) or dense (SeqDense), and CPU-bound
   operands force the CPU (SeqAIJ) fallback paths. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      Bdense = PETSC_FALSE,Bcusp = PETSC_FALSE,Ccusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&Bdense);CHKERRQ(ierr);
  /* Only consider the GPU sparse path when neither operand is pinned to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Bcusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    /* For triple products, C must also be a GPU matrix not bound to the CPU */
    Ccusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ccusp);CHKERRQ(ierr);
    }
  }
  if (Bdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Bcusp && Ccusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* handled by the generic chained-product implementation */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2682 
/* yy = A*xx (no transpose, nothing added) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* yy slot of the kernel is NULL: plain mat-vec, result overwrites yy */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2691 
/* zz = A*xx + yy (no transpose) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2700 
/* yy = A^H * xx (conjugate transpose, nothing added) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans=TRUE, herm=TRUE selects the conjugate-transpose operation in the kernel */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2709 
/* zz = A^H * xx + yy (conjugate transpose with add) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2718 
/* yy = A^T * xx (transpose without conjugation, nothing added) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans=TRUE, herm=FALSE selects the plain transpose operation in the kernel */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2727 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

   Shared worker behind all the MatMult*_SeqAIJCUSPARSE wrappers. yy may be NULL (nothing
   added) and may alias zz (in-place add); both cases are handled below. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* No nonzero rows: op(A)*x is identically zero, so the result is y (or zero if no y) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* Apply a (conjugate-)transpose cusparse operation to the untransposed matrix */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* Use an explicitly stored transpose (built on demand) with a NON_TRANSPOSE operation */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* For the transpose product the roles of rows/cols are swapped relative to the branch above */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cache of dense-vector descriptors and SpMV buffers */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* Transpose path: result is already full-length in zz; beta handled the in-place add (yy == zz) */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops (mul+add) per stored nonzero; without an add vector the first write per nonzero row is an assignment */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2911 
/* zz = A^T * xx + yy (transpose without conjugation, with add) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2920 
/* Finish assembly: run the host-side SeqAIJ assembly, then mark the GPU copy
   as authoritative when a device matrix exists. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *dmat = NULL;

  PetscFunctionBegin;
  /* A device (split CSR) matrix is only kept for unfactored matrices */
  if (A->factortype == MAT_FACTOR_NONE) dmat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  /* Nothing more to do for a flush-only assembly or a CPU-bound matrix */
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (dmat) A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
2937 
2938 /* --------------------------------------------------------------------------------*/
2939 /*@
2940    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2942    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2943    assembly performance the user should preallocate the matrix storage by setting
2944    the parameter nz (or the array nnz).  By setting these parameters accurately,
2945    performance during matrix assembly can be increased by more than a factor of 50.
2946 
2947    Collective
2948 
2949    Input Parameters:
2950 +  comm - MPI communicator, set to PETSC_COMM_SELF
2951 .  m - number of rows
2952 .  n - number of columns
2953 .  nz - number of nonzeros per row (same for all rows)
2954 -  nnz - array containing the number of nonzeros in the various rows
2955          (possibly different for each row) or NULL
2956 
2957    Output Parameter:
2958 .  A - the matrix
2959 
2960    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2962    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2963 
2964    Notes:
2965    If nnz is given then nz is ignored
2966 
2967    The AIJ format (also called the Yale sparse matrix format or
2968    compressed row storage), is fully compatible with standard Fortran 77
2969    storage.  That is, the stored row and column indices can begin at
2970    either one (as in Fortran) or zero.  See the users' manual for details.
2971 
2972    Specify the preallocated storage with either nz or nnz (not both).
2973    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2974    allocation.  For large problems you MUST preallocate memory or you
2975    will get TERRIBLE performance, see the users' manual chapter on matrices.
2976 
2977    By default, this format uses inodes (identical nodes) when possible, to
2978    improve numerical efficiency of matrix-vector products and solves. We
2979    search for consecutive rows with the same nonzero structure, thereby
2980    reusing matrix information to achieve increased efficiency.
2981 
2982    Level: intermediate
2983 
2984 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2985 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* a sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate CSR storage on the host; nnz (if given) overrides nz */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2997 
/* Destroys a SeqAIJCUSPARSE matrix: frees the GPU-side structures (mult structs or
   triangular factors, depending on factortype), any device-resident split-CSR
   matrix created for GPU assembly, removes the composed methods, and finally
   delegates to MatDestroy_SeqAIJ for the host-side data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* stash the device matrix pointer before spptr is torn down; it is freed below */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the struct back to the host so its embedded device pointers can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      /* diag.i was allocated separately only in the compressed-row case; see the setup code */
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* remove the methods composed in MatConvert_SeqAIJ_SeqAIJCUSPARSE / MatBindToCPU */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3032 
3033 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3034 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicates by first cloning the host AIJ matrix, then converting the clone
   in place back to SEQAIJCUSPARSE so it regains the GPU data structures. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3044 
/* Computes Y = a*X + Y. Falls back to the host implementation unless both
   matrices are SEQAIJCUSPARSE with identical nonzero patterns and both CSR
   structures are already on the GPU, in which case a single cublasXaxpy over
   the value arrays is used. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) // put axpy in aijcusparse, etc.
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  PetscBool      flgx,flgy;

  PetscFunctionBegin;
  if (a == (PetscScalar)0.0) PetscFunctionReturn(0);
  PetscValidHeaderSpecific(Y,MAT_CLASSID,1);
  PetscValidHeaderSpecific(X,MAT_CLASSID,3);
  ierr = PetscObjectTypeCompare((PetscObject)Y,MATSEQAIJCUSPARSE,&flgy);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQAIJCUSPARSE,&flgx);CHKERRQ(ierr);
  if (!flgx || !flgy) {
    /* at least one operand is a plain host AIJ matrix: do it on the CPU */
    ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  if (Y->factortype != MAT_FACTOR_NONE || X->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"both matrices must be MAT_FACTOR_NONE");
  /* the caller may not know the patterns actually match; detect that cheaply by
     comparing the host CSR index arrays so we can take the fast path below */
  if (str == DIFFERENT_NONZERO_PATTERN) {
    if (x->nz == y->nz) {
      PetscBool e;
      ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
      if (e) {
        ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
        if (e) {
          str = SAME_NONZERO_PATTERN;
        }
      }
    }
  }
  if (str != SAME_NONZERO_PATTERN) {
    ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  } else {
    Mat_SeqAIJCUSPARSE           *cusparsestruct_y = (Mat_SeqAIJCUSPARSE*)Y->spptr;
    Mat_SeqAIJCUSPARSE           *cusparsestruct_x = (Mat_SeqAIJCUSPARSE*)X->spptr;
    if (cusparsestruct_y->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
    if (cusparsestruct_x->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
    if (!cusparsestruct_y->mat || !cusparsestruct_x->mat) {
      /* GPU structures not built yet: sync to host, axpy there, then push both back */
      if (Y->offloadmask == PETSC_OFFLOAD_UNALLOCATED || Y->offloadmask == PETSC_OFFLOAD_GPU) {
        ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
      }
      if (X->offloadmask == PETSC_OFFLOAD_UNALLOCATED || X->offloadmask == PETSC_OFFLOAD_GPU) {
        ierr = MatSeqAIJCUSPARSECopyFromGPU(X);CHKERRQ(ierr);
      }
      ierr = MatAXPY_SeqAIJ(Y,a,X,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
    } else {
      /* fast path: same sparsity, both CSR value arrays on device -> dense axpy */
      cublasHandle_t cublasv2handle;
      cublasStatus_t cberr;
      cudaError_t    err;
      PetscScalar    alpha = a;
      PetscBLASInt   one = 1, bnz = 1;
      CsrMatrix      *matrix_y = (CsrMatrix*)cusparsestruct_y->mat->mat;
      CsrMatrix      *matrix_x = (CsrMatrix*)cusparsestruct_x->mat->mat;
      PetscScalar    *aa_y, *aa_x;
      aa_y = matrix_y->values->data().get();
      aa_x = matrix_x->values->data().get();
      ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
      ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      cberr = cublasXaxpy(cublasv2handle,bnz,&alpha,aa_x,one,aa_y,one);CHKERRCUBLAS(cberr);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
      ierr = PetscObjectStateIncrease((PetscObject)Y);CHKERRQ(ierr);
      /* only the device copy of Y was updated */
      if (Y->offloadmask == PETSC_OFFLOAD_BOTH) Y->offloadmask = PETSC_OFFLOAD_GPU;
      else if (Y->offloadmask != PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"wrong state");
      /* refresh the host copy so CPU-side consumers see the new values */
      ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
3119 
/* Zeros all stored values. If device CSR copies (and the cached transpose) exist
   they are zeroed with thrust::fill; the host array is always zeroed, so the
   offload mask ends up BOTH when GPU values were present, CPU otherwise. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscBool                  both = PETSC_FALSE;
  Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      /* keep the cached transpose consistent rather than invalidating it */
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}
3151 
/* Switches the matrix operation table between host (flg == PETSC_TRUE) and GPU
   implementations, and composes/uncomposes the corresponding method hooks.
   Binding to the CPU first syncs the values down from the GPU. No-op for
   factored matrices. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    /* restore the plain SeqAIJ host kernels */
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* drop the CUSPARSE-specific composed methods */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    /* install the GPU kernels defined in this file */
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inode optimization only applies to the host kernels */
  a->inode.use = flg;
  PetscFunctionReturn(0);
}
3198 
/* Converts a SeqAIJ matrix to SEQAIJCUSPARSE: allocates the GPU-side spptr
   (mult struct or tri-factors, depending on factortype), sets the CUDA default
   vector type, installs the CUSPARSE op table, and composes the format hook.
   Handles MAT_INITIAL_MATRIX / MAT_REUSE_MATRIX / (implicitly) MAT_INPLACE_MATRIX. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* for MAT_INPLACE_MATRIX, *newmat is A itself and is converted in place */
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the rest of the op table and composes the GPU-only methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3246 
/* Type constructor registered for MATSEQAIJCUSPARSE: builds a SeqAIJ matrix,
   converts it in place, then processes the CUSPARSE-specific options.
   (PetscOptionsObject is declared by the PetscObjectOptionsBegin macro.) */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3259 
3260 /*MC
3261    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3262 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3264    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3265    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3266 
3267    Options Database Keys:
3268 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3269 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3270 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3271 
3272   Level: beginner
3273 
3274 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3275 M*/
3276 
3277 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
3278 
3279 
/* Registers the cuSPARSE solver backend for every factorization type it
   supports (LU, Cholesky, ILU, ICC), all dispatching to the same factory. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  size_t              k;

  PetscFunctionBegin;
  for (k = 0; k < sizeof(ftypes)/sizeof(ftypes[0]); k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3291 
/* Frees the Mat_SeqAIJCUSPARSE container: both mult structs (matrix and cached
   transpose), the thrust work/permutation vectors, the cusparse handle, and
   (CUDA >= 11) the csr2csc scratch buffer. Safe when *cusparsestruct is NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* delete on a NULL pointer is a no-op, so these need no guards */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* cudaFree(NULL) is legal, so no guard needed here either */
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
   #endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3313 
/* Frees a CsrMatrix (allocated with new) together with its three device-side
   arrays, and resets the caller's pointer. A NULL *mat is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3326 
/* Frees one triangular-factor struct: its cusparse descriptor, analysis info,
   CSR storage, and the solve / host / csr2csc scratch buffers. Safe when
   *trifactor is NULL. (Overloads the mult-struct destructor below.) */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3346 
/* Frees a mult struct: the stored matrix (CSR or, pre-CUDA-11, ELL/HYB), its
   cusparse descriptor, the compressed-row index vector, the device-resident
   scalar constants, and (CUDA >= 11) the per-op SpMV descriptors and buffers.
   Safe when *matstruct is NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* bug fix: the return code was previously discarded, silently swallowing
           any failure inside CsrMatrix_Destroy() */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars used by the SpMV calls */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3390 
/* Releases the contents of a tri-factors container (all four factor structs,
   the permutation index vectors, and the work vector) but keeps the container
   itself and its cusparse handle alive, so it can be refilled by a refactor. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    /* NULL the freed members so a later Reset/Destroy does not double-free */
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
  }
  PetscFunctionReturn(0);
}
3410 
/* Fully destroys a tri-factors container: resets its contents, destroys the
   cusparse handle (if one was created), and frees the container itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* fetch-then-test instead of the original assignment-inside-if, which is
       warning-prone (-Wparentheses) and easy to misread as a comparison */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3427 
/* Strict-weak lexicographic ordering on (row, col) index pairs: orders by row
   first, then by column within a row. Used to sort COO entries into CSR order. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
3438 
/* Equality of (row, col) index pairs; used to collapse duplicate COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3448 
/* Binary op for adjacent_difference: 1 where consecutive values differ, else 0. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3457 
/* Logical OR of two flags, yielding 0 or 1 as a PetscInt. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
3466 
3467 #include <thrust/iterator/discard_iterator.h>
/* Sets/adds the matrix values for the COO entries registered by
   MatSetPreallocationCOO_SeqAIJCUSPARSE. v is in the caller's original COO
   order; cooPerm maps it to the sorted CSR order, and cooPerm_a (when present)
   groups duplicate (i,j) entries so they are reduced into one CSR slot.
   v may be host or device memory; a NULL v zeros (INSERT) or keeps (ADD) values. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation recorded: just run a normal (no-op) assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* stage host values into a temporary device array */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {
      /* duplicates present: reduce permuted v by group key into a scratch array,
         then add the reduced sums onto the current CSR values */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* unique entries: add permuted v elementwise onto the CSR values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* unique entries: overwrite the CSR values with permuted v */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3546 
3547 #include <thrust/binary_search.h>
/* Sets the nonzero pattern of A from n COO-style (i,j) index pairs supplied on the host.
   The pairs are copied to the GPU, sorted by (row,col), and duplicate pairs are compressed
   into a single nonzero; the resulting CSR row offsets and column indices are then copied
   back into the host arrays a->i/a->j.  cusp->cooPerm records the permutation that sorts
   the user's input; cusp->cooPerm_a (allocated only when duplicate entries exist) maps each
   sorted input position to its compressed nonzero slot so that a later MatSetValuesCOO()
   can accumulate values contributed to the same (i,j) location. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  /* if the number of COO entries changed, any cached permutation arrays are invalid */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);         /* device copy of the row indices */
    THRUSTINTARRAY d_j(n);         /* device copy of the column indices */
    THRUSTINTARRAY ii(A->rmap->n); /* per-row entry counts accumulated as offsets (a->i+1) */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    /* zip (i,j) so both index arrays are permuted together by the sort below */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort by (row,col) and record where each original entry ended up */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    /* save the sorted row indices (and, in w, the sorted column indices)
       before the in-place unique() below overwrites the tails of d_i/d_j */
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    /* compress duplicate (i,j) pairs in place; nekey marks the new logical end */
    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* flag positions where (i,j) differs from the previous sorted entry, then
         prefix-sum the flags: each sorted input position gets the index of the
         compressed nonzero it contributes to */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* for each row r, upper_bound over the sorted (deduplicated) row indices gives
       the offset one past the last entry of r, i.e. the CSR offsets a->i[1..m] */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host-side CSR arrays of the Mat_SeqAIJ from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    /* fill the per-row length/capacity arrays and gather row statistics */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* no entries: fall back to a standard empty preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* a cached transpose would refer to the old pattern; discard it */
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  /* values are not set yet: the matrix must be assembled again after MatSetValuesCOO() */
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3647 
/* Hands out a read-only device pointer to the CSR value array of A.
   The matrix data is pushed to the GPU first if the device copy is stale.
   Pair every call with MatSeqAIJCUSPARSERestoreArrayRead(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy is current before exposing a pointer into it */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  {
    CsrMatrix *matrix = (CsrMatrix*)cusparsestruct->mat->mat;

    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
    *a = matrix->values->data().get();
  }
  PetscFunctionReturn(0);
}
3666 
/* Companion of MatSeqAIJCUSPARSEGetArrayRead(): invalidates the caller's pointer.
   Read access cannot have modified the matrix, so no state is updated here. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* zero the handle so a stale device pointer cannot be reused by accident */
  *a = NULL;
  PetscFunctionReturn(0);
}
3676 
/* Hands out a write-only device pointer to the CSR value array of A.
   Unlike the read variant, the current values are NOT copied to the GPU first,
   since the caller is expected to overwrite them.  Pair every call with
   MatSeqAIJCUSPARSERestoreArrayWrite(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  {
    CsrMatrix *matrix = (CsrMatrix*)cusparsestruct->mat->mat;

    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
    *a = matrix->values->data().get();
  }
  PetscFunctionReturn(0);
}
3693 
/* Companion of MatSeqAIJCUSPARSEGetArrayWrite(): marks the matrix as modified.
   Bumps the object state and declares the GPU copy authoritative, then
   invalidates the caller's pointer. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the caller wrote device-side values: record the change and the data location */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_GPU;
  *a = NULL; /* prevent accidental reuse of the returned device pointer */
  PetscFunctionReturn(0);
}
3707 
/* Strict-weak ordering on (row, col, value, perm) tuples used when merging two
   COO streams: compares lexicographically by row then column; the value and
   permutation fields do not participate in the ordering. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    return t1.get<1>() < t2.get<1>();
  }
};
3718 
/* Unary functor that adds a fixed offset to an integer index; used to shift
   column indices / row offsets when concatenating two CSR matrices. */
struct Shift
{
  int _shift; /* offset applied to every input value */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _shift + c;
  }
};
3730 
/* merges two SeqAIJCUSPARSE matrices side by side, the [A';B']' operation in MATLAB notation */
/* Concatenates two SeqAIJCUSPARSE matrices A and B (same row count) horizontally into C,
   entirely on the GPU.  With MAT_INITIAL_MATRIX the CSR structure of C is built by
   converting A and B to COO, merging the two streams ordered by (row, shifted col), and
   converting back to CSR; Ccusp->cooPerm records, for A's entries followed by B's entries,
   the destination slot of each value in C so that MAT_REUSE_MATRIX can refresh C's values
   with two permuted copies and no structural work.  If both inputs carry a transpose,
   C's transpose is assembled as well by stacking the transposes of A and B. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C with m rows and the combined column count, and hand-build its
       GPU multiply structure (descriptor plus device-side scalar constants) */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    /* make sure both inputs are resident on the GPU, with transposes if available */
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    /* cooPerm will hold, for each value of A then of B, its slot in C's value array */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand both CSR row-offset arrays into explicit COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* per-entry origin tags: 1 = from A, 0 = from B (ints, not bools; see above) */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* view B's columns shifted by A's column count, without touching B */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      /* shift B's columns in place and undo the shift after the merge (below) */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      /* merge the two (row, col, value, origin) streams into C's COO arrays,
         ordered by (row, col); wPerm receives the origin tags in merged order */
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      /* split the merged positions by origin tag: indices where wPerm==1 (A's
         entries) go to cooPerm[0..Annz), the rest (B's) to cooPerm[Annz..) */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices back into C's CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
        /* C' = [A' ; B'] stacked vertically: copy A' then append B' with its
           row offsets shifted by A's nonzero count */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        Ccusp->transgen = PETSC_TRUE;
        CmatT->cprowIndices  = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          /* step back one slot: A's trailing offset (== a->nz) coincides with
             B's shifted leading offset and is overwritten by it */
          thrust::advance(rT,-1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror C's CSR structure on the host so C behaves like a regular SeqAIJ */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    /* host values are left unset; the authoritative values live on the GPU */
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: structure is unchanged, only refresh C's values using
       the permutation recorded during the initial merge */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      /* sanity checks: the cached structure must still match both inputs */
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* scatter A's values into C through cooPerm[0..Annz) ... */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* ... and B's values through cooPerm[Annz..) */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
        /* refresh C' as well by restacking the values of A' and B' */
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  /* C is fully defined on the device; flag it assembled with GPU data authoritative */
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4033