1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library.
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/async/for_each.h>
17 #include <thrust/iterator/constant_iterator.h>
18 #include <thrust/remove.h>
19 #include <thrust/sort.h>
20 #include <thrust/unique.h>
21 
22 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
23 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
24   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we copy them in
25     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
26 
27   typedef enum {
28       CUSPARSE_MV_ALG_DEFAULT = 0,
29       CUSPARSE_COOMV_ALG      = 1,
30       CUSPARSE_CSRMV_ALG1     = 2,
31       CUSPARSE_CSRMV_ALG2     = 3
32   } cusparseSpMVAlg_t;
33 
34   typedef enum {
35       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
36       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
37       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
38       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
39       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
40       CUSPARSE_SPMM_ALG_DEFAULT = 0,
41       CUSPARSE_SPMM_COO_ALG1    = 1,
42       CUSPARSE_SPMM_COO_ALG2    = 2,
43       CUSPARSE_SPMM_COO_ALG3    = 3,
44       CUSPARSE_SPMM_COO_ALG4    = 5,
45       CUSPARSE_SPMM_CSR_ALG1    = 4,
46       CUSPARSE_SPMM_CSR_ALG2    = 6,
47   } cusparseSpMMAlg_t;
48 
49   typedef enum {
50       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
51       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
52   } cusparseCsr2CscAlg_t;
53   */
54   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
55   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
56   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
57 #endif
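/* For example (an illustrative invocation, not taken from this source), -mat_cusparse_spmv_alg csrmv_alg1 is parsed
   by PetscOptionsEnum() against MatCUSPARSESpMVAlgorithms[] to the integer value 2, which must match
   CUSPARSE_CSRMV_ALG1 (CUSPARSE_SPMV_CSR_ALG1 in newer cuSPARSE releases); the PetscCheck() calls in
   MatSetFromOptions_SeqAIJCUSPARSE() below guard against a mismatch should the cuSPARSE enums ever change. */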
58 
59 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
60 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
61 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62 
63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66 
67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
81 
82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
86 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
87 
88 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
89 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
90 
91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
93 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
94 
95 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
96 {
97   PetscFunctionBegin;
98   *type = MATSOLVERCUSPARSE;
99   PetscFunctionReturn(0);
100 }
101 
102 /*MC
103   MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solves on a single GPU for
104   sequential matrices of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp. Currently supported
105   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
106   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
107   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
108   algorithms are not recommended. This class does NOT support direct solver operations.
109 
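   Example Usage (a minimal sketch, assuming the matrix A, the KSP ksp, and the PC pc already exist; not taken from
   this source):
.vb
   PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));
   PetscCall(KSPGetPC(ksp, &pc));
   PetscCall(PCSetType(pc, PCILU));
   PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERCUSPARSE));
.ve
   or, equivalently, run with the options -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse.
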
110   Level: beginner
111 
112 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
113 M*/
114 
115 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
116 {
117   PetscInt       n = A->rmap->n;
118 
119   PetscFunctionBegin;
120   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
121   PetscCall(MatSetSizes(*B,n,n,n,n));
122   (*B)->factortype = ftype;
123   PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE));
124 
125   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE));
126   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
127     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
128     if (!A->boundtocpu) {
129       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
130       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
131     } else {
132       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
133       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
134     }
135     PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
136     PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
137     PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
138   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
139     if (!A->boundtocpu) {
140       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
141       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
142     } else {
143       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
144       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
145     }
146     PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
147     PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
148   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
149 
150   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
151   (*B)->canuseordering = PETSC_TRUE;
152   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
153   PetscFunctionReturn(0);
154 }
155 
156 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
157 {
158   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
159 
160   PetscFunctionBegin;
161   switch (op) {
162   case MAT_CUSPARSE_MULT:
163     cusparsestruct->format = format;
164     break;
165   case MAT_CUSPARSE_ALL:
166     cusparsestruct->format = format;
167     break;
168   default:
169     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
170   }
171   PetscFunctionReturn(0);
172 }
173 
174 /*@
175    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
176    operation. Only the MatMult operation can use different GPU storage formats
177    for MPIAIJCUSPARSE matrices.
178    Not Collective
179 
180    Input Parameters:
181 +  A - Matrix of type SEQAIJCUSPARSE
182 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
183 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, or MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)
184 
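   Example Usage (a minimal sketch; A is assumed to already be a (SEQ)AIJCUSPARSE matrix):
.vb
   PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL));
.ve
   The same choice can be made at runtime with -mat_cusparse_mult_storage_format ell, or with
   -mat_cusparse_storage_format ell to set the format for all supported operations.
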
187    Level: intermediate
188 
189 .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
190 @*/
191 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
192 {
193   PetscFunctionBegin;
194   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
195   PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
196   PetscFunctionReturn(0);
197 }
198 
199 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
200 {
201   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
202 
203   PetscFunctionBegin;
204   cusparsestruct->use_cpu_solve = use_cpu;
205   PetscFunctionReturn(0);
206 }
207 
208 /*@
209    MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU MatSolve().
210 
211    Input Parameters:
212 +  A - Matrix of type SEQAIJCUSPARSE
213 -  use_cpu - set flag for using the built-in CPU MatSolve
214 
217    Notes:
218    The cuSPARSE LU solver currently computes the factors with the built-in CPU method
219    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
220    Use this routine to specify whether the solve is done on the CPU or the GPU (the GPU is the default).
221 
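   Example Usage (a minimal sketch; A is assumed to be a SEQAIJCUSPARSE matrix that will be factored):
.vb
   PetscCall(MatCUSPARSESetUseCPUSolve(A, PETSC_TRUE));
.ve
   The same behavior can be requested at runtime with the option -mat_cusparse_use_cpu_solve.
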
222    Level: intermediate
223 
224 .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
225 @*/
226 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
227 {
228   PetscFunctionBegin;
229   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
230   PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));
231   PetscFunctionReturn(0);
232 }
233 
234 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
235 {
236   PetscFunctionBegin;
237   switch (op) {
238     case MAT_FORM_EXPLICIT_TRANSPOSE:
239       /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
240       if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
241       A->form_explicit_transpose = flg;
242       break;
243     default:
244       PetscCall(MatSetOption_SeqAIJ(A,op,flg));
245       break;
246   }
247   PetscFunctionReturn(0);
248 }
249 
250 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
251 
252 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
253 {
254   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
255   IS             isrow = b->row,iscol = b->col;
256   PetscBool      row_identity,col_identity;
257   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;
258 
259   PetscFunctionBegin;
260   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
261   PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
262   B->offloadmask = PETSC_OFFLOAD_CPU;
263   /* determine which version of MatSolve needs to be used. */
264   PetscCall(ISIdentity(isrow,&row_identity));
265   PetscCall(ISIdentity(iscol,&col_identity));
266 
267   if (!cusparsestruct->use_cpu_solve) {
268     if (row_identity && col_identity) {
269       B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
270       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
271     } else {
272       B->ops->solve = MatSolve_SeqAIJCUSPARSE;
273       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
274     }
275   }
276   B->ops->matsolve = NULL;
277   B->ops->matsolvetranspose = NULL;
278 
279   /* get the triangular factors */
280   if (!cusparsestruct->use_cpu_solve) {
281     PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
282   }
283   PetscFunctionReturn(0);
284 }
285 
286 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
287 {
288   MatCUSPARSEStorageFormat format;
289   PetscBool                flg;
290   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
291 
292   PetscFunctionBegin;
293   PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options");
294   if (A->factortype == MAT_FACTOR_NONE) {
295     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
296                                "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
297     if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));
298 
299     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
300                                "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
301     if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
302     PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
303     if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
304 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
305     PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
306                                "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg));
307     /* If the user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
308 #if CUSPARSE_VERSION > 11301
309     PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
310 #else
311     PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
312 #endif
313     PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
314                                "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg));
315     PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
316 
317     PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
318                                "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg));
319     PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
320 #endif
321   }
322   PetscOptionsHeadEnd();
323   PetscFunctionReturn(0);
324 }
325 
326 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
327 {
328   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
329 
330   PetscFunctionBegin;
331   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
332   PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
333   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
334   PetscFunctionReturn(0);
335 }
336 
337 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
338 {
339   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
340 
341   PetscFunctionBegin;
342   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
343   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
344   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
345   PetscFunctionReturn(0);
346 }
347 
348 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
349 {
350   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
351 
352   PetscFunctionBegin;
353   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
354   PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
355   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
356   PetscFunctionReturn(0);
357 }
358 
359 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
360 {
361   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
362 
363   PetscFunctionBegin;
364   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
365   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
366   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
367   PetscFunctionReturn(0);
368 }
369 
370 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
371 {
372   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
373   PetscInt                          n = A->rmap->n;
374   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
375   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
376   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
377   const MatScalar                   *aa = a->a,*v;
378   PetscInt                          *AiLo, *AjLo;
379   PetscInt                          i,nz, nzLower, offset, rowOffset;
380 
381   PetscFunctionBegin;
382   if (!n) PetscFunctionReturn(0);
383   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
384     try {
385       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
386       nzLower=n+ai[n]-ai[1];
387       if (!loTriFactor) {
388         PetscScalar                       *AALo;
389 
390         PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));
391 
392         /* Allocate Space for the lower triangular matrix */
393         PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
394         PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));
395 
396         /* Fill the lower triangular matrix */
397         AiLo[0]  = (PetscInt) 0;
398         AiLo[n]  = nzLower;
399         AjLo[0]  = (PetscInt) 0;
400         AALo[0]  = (MatScalar) 1.0;
401         v        = aa;
402         vi       = aj;
403         offset   = 1;
404         rowOffset= 1;
405         for (i=1; i<n; i++) {
406           nz = ai[i+1] - ai[i];
407           /* additional 1 for the term on the diagonal */
408           AiLo[i]    = rowOffset;
409           rowOffset += nz+1;
410 
411           PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
412           PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));
413 
414           offset      += nz;
415           AjLo[offset] = (PetscInt) i;
416           AALo[offset] = (MatScalar) 1.0;
417           offset      += 1;
418 
419           v  += nz;
420           vi += nz;
421         }
422 
423         /* allocate space for the triangular factor information */
424         PetscCall(PetscNew(&loTriFactor));
425         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
426         /* Create the matrix description */
427         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
428         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
429        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
430         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
431        #else
432         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
433        #endif
434         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
435         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
436 
437         /* set the operation */
438         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
439 
440         /* set the matrix */
441         loTriFactor->csrMat = new CsrMatrix;
442         loTriFactor->csrMat->num_rows = n;
443         loTriFactor->csrMat->num_cols = n;
444         loTriFactor->csrMat->num_entries = nzLower;
445 
446         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
447         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
448 
449         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
450         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
451 
452         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
453         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
454 
455         /* Create the solve analysis information */
456         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
457         PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
458       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
459         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
460                                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
461                                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
462                                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
463                                                &loTriFactor->solveBufferSize));
464         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
465       #endif
466 
467         /* perform the solve analysis */
468         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
469                                          loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
470                                          loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
471                                          loTriFactor->csrMat->column_indices->data().get(),
472                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
473                                          loTriFactor->solveInfo,
474                                          loTriFactor->solvePolicy, loTriFactor->solveBuffer));
475                                          #else
476                                          loTriFactor->solveInfo));
477                                          #endif
478         PetscCallCUDA(WaitForCUDA());
479         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
480 
481         /* assign the pointer */
482         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
483         loTriFactor->AA_h = AALo;
484         PetscCallCUDA(cudaFreeHost(AiLo));
485         PetscCallCUDA(cudaFreeHost(AjLo));
486         PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
487       } else { /* update values only */
488         if (!loTriFactor->AA_h) {
489           PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
490         }
491         /* Fill the lower triangular matrix */
492         loTriFactor->AA_h[0]  = 1.0;
493         v        = aa;
494         vi       = aj;
495         offset   = 1;
496         for (i=1; i<n; i++) {
497           nz = ai[i+1] - ai[i];
498           PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
499           offset      += nz;
500           loTriFactor->AA_h[offset] = 1.0;
501           offset      += 1;
502           v  += nz;
503         }
504         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
505         PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
506       }
507     } catch(char *ex) {
508       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
509     }
510   }
511   PetscFunctionReturn(0);
512 }
513 
514 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
515 {
516   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
517   PetscInt                          n = A->rmap->n;
518   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
519   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
520   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
521   const MatScalar                   *aa = a->a,*v;
522   PetscInt                          *AiUp, *AjUp;
523   PetscInt                          i,nz, nzUpper, offset;
524 
525   PetscFunctionBegin;
526   if (!n) PetscFunctionReturn(0);
527   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
528     try {
529       /* next, figure out the number of nonzeros in the upper triangular matrix. */
530       nzUpper = adiag[0]-adiag[n];
531       if (!upTriFactor) {
532         PetscScalar *AAUp;
533 
534         PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
535 
536         /* Allocate Space for the upper triangular matrix */
537         PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
538         PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));
539 
540         /* Fill the upper triangular matrix */
541         AiUp[0]=(PetscInt) 0;
542         AiUp[n]=nzUpper;
543         offset = nzUpper;
544         for (i=n-1; i>=0; i--) {
545           v  = aa + adiag[i+1] + 1;
546           vi = aj + adiag[i+1] + 1;
547 
548           /* number of elements NOT on the diagonal */
549           nz = adiag[i] - adiag[i+1]-1;
550 
551           /* decrement the offset */
552           offset -= (nz+1);
553 
554           /* first, set the diagonal elements */
555           AjUp[offset] = (PetscInt) i;
556           AAUp[offset] = (MatScalar)1./v[nz];
557           AiUp[i]      = AiUp[i+1] - (nz+1);
558 
559           PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
560           PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
561         }
562 
563         /* allocate space for the triangular factor information */
564         PetscCall(PetscNew(&upTriFactor));
565         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
566 
567         /* Create the matrix description */
568         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
569         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
570        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
571         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
572        #else
573         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
574        #endif
575         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
576         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
577 
578         /* set the operation */
579         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
580 
581         /* set the matrix */
582         upTriFactor->csrMat = new CsrMatrix;
583         upTriFactor->csrMat->num_rows = n;
584         upTriFactor->csrMat->num_cols = n;
585         upTriFactor->csrMat->num_entries = nzUpper;
586 
587         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
588         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
589 
590         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
591         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
592 
593         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
594         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
595 
596         /* Create the solve analysis information */
597         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
598         PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
599       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
600         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
601                                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
602                                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
603                                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
604                                                &upTriFactor->solveBufferSize));
605         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
606       #endif
607 
608         /* perform the solve analysis */
609         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
610                                          upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
611                                          upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
612                                          upTriFactor->csrMat->column_indices->data().get(),
613                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
614                                          upTriFactor->solveInfo,
615                                          upTriFactor->solvePolicy, upTriFactor->solveBuffer));
616                                          #else
617                                          upTriFactor->solveInfo));
618                                          #endif
619         PetscCallCUDA(WaitForCUDA());
620         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
621 
622         /* assign the pointer */
623         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
624         upTriFactor->AA_h = AAUp;
625         PetscCallCUDA(cudaFreeHost(AiUp));
626         PetscCallCUDA(cudaFreeHost(AjUp));
627         PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
628       } else {
629         if (!upTriFactor->AA_h) {
630           PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
631         }
632         /* Fill the upper triangular matrix */
633         offset = nzUpper;
634         for (i=n-1; i>=0; i--) {
635           v  = aa + adiag[i+1] + 1;
636 
637           /* number of elements NOT on the diagonal */
638           nz = adiag[i] - adiag[i+1]-1;
639 
640           /* decrement the offset */
641           offset -= (nz+1);
642 
643           /* first, set the diagonal elements */
644           upTriFactor->AA_h[offset] = 1./v[nz];
645           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
646         }
647         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
648         PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
649       }
650     } catch(char *ex) {
651       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
652     }
653   }
654   PetscFunctionReturn(0);
655 }
656 
657 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
658 {
659   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
660   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
661   IS                           isrow = a->row,iscol = a->icol;
662   PetscBool                    row_identity,col_identity;
663   PetscInt                     n = A->rmap->n;
664 
665   PetscFunctionBegin;
666   PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
667   PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
668   PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
669 
670   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
671   cusparseTriFactors->nnz=a->nz;
672 
673   A->offloadmask = PETSC_OFFLOAD_BOTH;
674   /* lower triangular indices */
675   PetscCall(ISIdentity(isrow,&row_identity));
676   if (!row_identity && !cusparseTriFactors->rpermIndices) {
677     const PetscInt *r;
678 
679     PetscCall(ISGetIndices(isrow,&r));
680     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
681     cusparseTriFactors->rpermIndices->assign(r, r+n);
682     PetscCall(ISRestoreIndices(isrow,&r));
683     PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
684   }
685 
686   /* upper triangular indices */
687   PetscCall(ISIdentity(iscol,&col_identity));
688   if (!col_identity && !cusparseTriFactors->cpermIndices) {
689     const PetscInt *c;
690 
691     PetscCall(ISGetIndices(iscol,&c));
692     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
693     cusparseTriFactors->cpermIndices->assign(c, c+n);
694     PetscCall(ISRestoreIndices(iscol,&c));
695     PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
696   }
697   PetscFunctionReturn(0);
698 }
699 
700 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
701 {
702   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
703   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
704   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
705   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
706   PetscInt                          *AiUp, *AjUp;
707   PetscScalar                       *AAUp;
708   PetscScalar                       *AALo;
709   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
710   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
711   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
712   const MatScalar                   *aa = b->a,*v;
713 
714   PetscFunctionBegin;
715   if (!n) PetscFunctionReturn(0);
716   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
717     try {
718       PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
719       PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
720       if (!upTriFactor && !loTriFactor) {
721         /* Allocate Space for the upper triangular matrix */
722         PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
723         PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));
724 
725         /* Fill the upper triangular matrix */
726         AiUp[0]=(PetscInt) 0;
727         AiUp[n]=nzUpper;
728         offset = 0;
729         for (i=0; i<n; i++) {
730           /* set the pointers */
731           v  = aa + ai[i];
732           vj = aj + ai[i];
733           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
734 
735           /* first, set the diagonal elements */
736           AjUp[offset] = (PetscInt) i;
737           AAUp[offset] = (MatScalar)1.0/v[nz];
738           AiUp[i]      = offset;
739           AALo[offset] = (MatScalar)1.0/v[nz];
740 
741           offset+=1;
742           if (nz>0) {
743             PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
744             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
745             for (j=offset; j<offset+nz; j++) {
746               AAUp[j] = -AAUp[j];
747               AALo[j] = AAUp[j]/v[nz];
748             }
749             offset+=nz;
750           }
751         }
752 
753         /* allocate space for the triangular factor information */
754         PetscCall(PetscNew(&upTriFactor));
755         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
756 
757         /* Create the matrix description */
758         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
759         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
760        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
761         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
762        #else
763         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
764        #endif
765         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
766         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
767 
768         /* set the matrix */
769         upTriFactor->csrMat = new CsrMatrix;
770         upTriFactor->csrMat->num_rows = A->rmap->n;
771         upTriFactor->csrMat->num_cols = A->cmap->n;
772         upTriFactor->csrMat->num_entries = a->nz;
773 
774         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
775         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
776 
777         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
778         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
779 
780         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
781         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
782 
783         /* set the operation */
784         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
785 
786         /* Create the solve analysis information */
787         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
788         PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
789       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
790         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
791                                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
792                                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
793                                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
794                                                &upTriFactor->solveBufferSize));
795         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
796       #endif
797 
798         /* perform the solve analysis */
799         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
800                                          upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
801                                          upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
802                                          upTriFactor->csrMat->column_indices->data().get(),
803                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
804                                          upTriFactor->solveInfo,
805                                          upTriFactor->solvePolicy, upTriFactor->solveBuffer));
806                                          #else
807                                          upTriFactor->solveInfo));
808                                          #endif
809         PetscCallCUDA(WaitForCUDA());
810         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
811 
812         /* assign the pointer */
813         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
814 
815         /* allocate space for the triangular factor information */
816         PetscCall(PetscNew(&loTriFactor));
817         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
818 
819         /* Create the matrix description */
820         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
821         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
822        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
823         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
824        #else
825         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
826        #endif
827         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
828         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
829 
830         /* set the operation */
831         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
832 
833         /* set the matrix */
834         loTriFactor->csrMat = new CsrMatrix;
835         loTriFactor->csrMat->num_rows = A->rmap->n;
836         loTriFactor->csrMat->num_cols = A->cmap->n;
837         loTriFactor->csrMat->num_entries = a->nz;
838 
839         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
840         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
841 
842         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
843         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
844 
845         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
846         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
847 
848         /* Create the solve analysis information */
849         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
850         PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
851       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
852         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
853                                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
854                                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
855                                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
856                                                &loTriFactor->solveBufferSize));
857         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
858       #endif
859 
860         /* perform the solve analysis */
861         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
862                                          loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
863                                          loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
864                                          loTriFactor->csrMat->column_indices->data().get(),
865                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
866                                          loTriFactor->solveInfo,
867                                          loTriFactor->solvePolicy, loTriFactor->solveBuffer));
868                                          #else
869                                          loTriFactor->solveInfo));
870                                          #endif
871         PetscCallCUDA(WaitForCUDA());
872         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
873 
874         /* assign the pointer */
875         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
876 
877         PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
878         PetscCallCUDA(cudaFreeHost(AiUp));
879         PetscCallCUDA(cudaFreeHost(AjUp));
880       } else {
881         /* Fill the upper triangular matrix */
882         offset = 0;
883         for (i=0; i<n; i++) {
884           /* set the pointers */
885           v  = aa + ai[i];
886           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
887 
888           /* first, set the diagonal elements */
889           AAUp[offset] = 1.0/v[nz];
890           AALo[offset] = 1.0/v[nz];
891 
892           offset+=1;
893           if (nz>0) {
894             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
895             for (j=offset; j<offset+nz; j++) {
896               AAUp[j] = -AAUp[j];
897               AALo[j] = AAUp[j]/v[nz];
898             }
899             offset+=nz;
900           }
901         }
902         PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing upper triangular factor");
903         PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing lower triangular factor");
904         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
905         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
906         PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
907       }
908       PetscCallCUDA(cudaFreeHost(AAUp));
909       PetscCallCUDA(cudaFreeHost(AALo));
910     } catch(char *ex) {
911       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
912     }
913   }
914   PetscFunctionReturn(0);
915 }
916 
917 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
918 {
919   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
920   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
921   IS                           ip = a->row;
922   PetscBool                    perm_identity;
923   PetscInt                     n = A->rmap->n;
924 
925   PetscFunctionBegin;
926   PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
927   PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
928   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
929   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
930 
931   A->offloadmask = PETSC_OFFLOAD_BOTH;
932 
933   /* lower triangular indices */
934   PetscCall(ISIdentity(ip,&perm_identity));
935   if (!perm_identity) {
936     IS             iip;
937     const PetscInt *irip,*rip;
938 
939     PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip));
940     PetscCall(ISGetIndices(iip,&irip));
941     PetscCall(ISGetIndices(ip,&rip));
942     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
943     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
944     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
945     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
946     PetscCall(ISRestoreIndices(iip,&irip));
947     PetscCall(ISDestroy(&iip));
948     PetscCall(ISRestoreIndices(ip,&rip));
949     PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
950   }
951   PetscFunctionReturn(0);
952 }
953 
954 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
955 {
956   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
957   IS             ip = b->row;
958   PetscBool      perm_identity;
959 
960   PetscFunctionBegin;
961   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
962   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
963   B->offloadmask = PETSC_OFFLOAD_CPU;
964   /* determine which version of MatSolve needs to be used. */
965   PetscCall(ISIdentity(ip,&perm_identity));
966   if (perm_identity) {
967     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
968     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
969     B->ops->matsolve = NULL;
970     B->ops->matsolvetranspose = NULL;
971   } else {
972     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
973     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
974     B->ops->matsolve = NULL;
975     B->ops->matsolvetranspose = NULL;
976   }
977 
978   /* get the triangular factors */
979   PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
980   PetscFunctionReturn(0);
981 }
982 
983 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
984 {
985   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
986   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
987   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
988   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
989   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
990   cusparseIndexBase_t               indexBase;
991   cusparseMatrixType_t              matrixType;
992   cusparseFillMode_t                fillMode;
993   cusparseDiagType_t                diagType;
994 
995   PetscFunctionBegin;
996   /* allocate space for the transpose of the lower triangular factor */
997   PetscCall(PetscNew(&loTriFactorT));
998   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
999 
1000   /* set the matrix descriptors of the lower triangular factor */
1001   matrixType = cusparseGetMatType(loTriFactor->descr);
1002   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1003   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1004     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1005   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1006 
1007   /* Create the matrix description */
1008   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1009   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1010   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1011   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1012   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1013 
1014   /* set the operation */
1015   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1016 
1017   /* allocate GPU space for the CSC of the lower triangular factor*/
1018   loTriFactorT->csrMat = new CsrMatrix;
1019   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1020   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1021   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1022   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1023   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1024   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1025 
1026   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1027 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1028   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1029                                                loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1030                                                loTriFactor->csrMat->values->data().get(),
1031                                                loTriFactor->csrMat->row_offsets->data().get(),
1032                                                loTriFactor->csrMat->column_indices->data().get(),
1033                                                loTriFactorT->csrMat->values->data().get(),
1034                                                loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1035                                                CUSPARSE_ACTION_NUMERIC,indexBase,
1036                                                CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1037   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize));
1038 #endif
1039 
1040   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1041   PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1042                                   loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1043                                   loTriFactor->csrMat->values->data().get(),
1044                                   loTriFactor->csrMat->row_offsets->data().get(),
1045                                   loTriFactor->csrMat->column_indices->data().get(),
1046                                   loTriFactorT->csrMat->values->data().get(),
1047                                   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1048                                   loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1049                                   CUSPARSE_ACTION_NUMERIC, indexBase,
1050                                   CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
1051                                   #else
1052                                   loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1053                                   CUSPARSE_ACTION_NUMERIC, indexBase));
1054                                   #endif
1055   PetscCallCUDA(WaitForCUDA());
1056   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1057 
1058   /* Create the solve analysis information */
1059   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1060   PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactorT->solveInfo));
1061 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1062   PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1063                                          loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1064                                          loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1065                                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1066                                          &loTriFactorT->solveBufferSize));
1067   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize));
1068 #endif
1069 
1070   /* perform the solve analysis */
1071   PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1072                                    loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1073                                    loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1074                                    loTriFactorT->csrMat->column_indices->data().get(),
1075                                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1076                                    loTriFactorT->solveInfo,
1077                                    loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1078                                    #else
1079                                    loTriFactorT->solveInfo));
1080                                    #endif
1081   PetscCallCUDA(WaitForCUDA());
1082   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1083 
1084   /* assign the pointer */
1085   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1086 
1087   /*********************************************/
1088   /* Now the Transpose of the Upper Tri Factor */
1089   /*********************************************/
1090 
1091   /* allocate space for the transpose of the upper triangular factor */
1092   PetscCall(PetscNew(&upTriFactorT));
1093   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1094 
1095   /* set the matrix descriptors of the upper triangular factor */
1096   matrixType = cusparseGetMatType(upTriFactor->descr);
1097   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1098   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1099     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1100   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1101 
1102   /* Create the matrix description */
1103   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1104   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1105   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1106   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1107   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1108 
1109   /* set the operation */
1110   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1111 
1112   /* allocate GPU space for the CSC of the upper triangular factor */
1113   upTriFactorT->csrMat = new CsrMatrix;
1114   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1115   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1116   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1117   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1118   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1119   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1120 
1121   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1122 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1123   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1124                                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1125                                                upTriFactor->csrMat->values->data().get(),
1126                                                upTriFactor->csrMat->row_offsets->data().get(),
1127                                                upTriFactor->csrMat->column_indices->data().get(),
1128                                                upTriFactorT->csrMat->values->data().get(),
1129                                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1130                                                CUSPARSE_ACTION_NUMERIC,indexBase,
1131                                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1132   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize));
1133 #endif
1134 
1135   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1136   PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1137                                   upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1138                                   upTriFactor->csrMat->values->data().get(),
1139                                   upTriFactor->csrMat->row_offsets->data().get(),
1140                                   upTriFactor->csrMat->column_indices->data().get(),
1141                                   upTriFactorT->csrMat->values->data().get(),
1142                                   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1143                                   upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1144                                   CUSPARSE_ACTION_NUMERIC, indexBase,
1145                                   CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
1146                                   #else
1147                                   upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1148                                  CUSPARSE_ACTION_NUMERIC, indexBase));
1149                                  #endif
1150 
1151   PetscCallCUDA(WaitForCUDA());
1152   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1153 
1154   /* Create the solve analysis information */
1155   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1156   PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactorT->solveInfo));
1157   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1158   PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1159                                          upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1160                                          upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1161                                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1162                                          &upTriFactorT->solveBufferSize));
1163   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize));
1164   #endif
1165 
1166   /* perform the solve analysis */
1167   /* TODO: this repeated lower/upper triangular transpose setup should be refactored into a helper function */
1168   PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1169                                    upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1170                                    upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1171                                    upTriFactorT->csrMat->column_indices->data().get(),
1172                                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1173                                    upTriFactorT->solveInfo,
1174                                    upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1175                                    #else
1176                                    upTriFactorT->solveInfo));
1177                                    #endif
1178 
1179   PetscCallCUDA(WaitForCUDA());
1180   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1181 
1182   /* assign the pointer */
1183   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1184   PetscFunctionReturn(0);
1185 }
1186 
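/* Functor for thrust::transform(): converts the real part of a PetscScalar to a PetscInt. It is used in
   MatSeqAIJCUSPARSEFormExplicitTranspose() to turn the values produced by csr2csc applied to the sequence
   0,1,2,... into the integer permutation csr2csc_i */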
1187 struct PetscScalarToPetscInt
1188 {
1189   __host__ __device__
1190   PetscInt operator()(PetscScalar s)
1191   {
1192     return (PetscInt)PetscRealPart(s);
1193   }
1194 };
1195 
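/* Build (or update) the explicit transpose of A on the GPU. For CSR storage, the permutation csr2csc_i
   mapping transpose entries back to entries of A is cached, so cusparse csr2csc runs only once per
   nonzero pattern and later calls simply permute the values */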
1196 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1197 {
1198   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1199   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1200   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1201   cusparseStatus_t             stat;
1202   cusparseIndexBase_t          indexBase;
1203 
1204   PetscFunctionBegin;
1205   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1206   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1207   PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
1208   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1209   PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
1210   if (A->transupdated) PetscFunctionReturn(0);
1211   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1212   PetscCall(PetscLogGpuTimeBegin());
1213   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1214     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
1215   }
1216   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1217     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1218     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1219     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1220     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1221     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1222 
1223     /* set alpha and beta */
1224     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
1225     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
1226     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
1227     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1228     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
1229     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1230 
1231     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1232       CsrMatrix *matrixT = new CsrMatrix;
1233       matstructT->mat = matrixT;
1234       matrixT->num_rows = A->cmap->n;
1235       matrixT->num_cols = A->rmap->n;
1236       matrixT->num_entries = a->nz;
1237       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1238       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1239       matrixT->values = new THRUSTARRAY(a->nz);
1240 
1241       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1242       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1243 
1244      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1245       #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
1246         stat = cusparseCreateCsr(&matstructT->matDescr,
1247                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1248                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1249                                matrixT->values->data().get(),
1250                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1251                                indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
1252       #else
1253         /* cusparse-11.x returns errors for zero-sized matrices until 11.2.1,
1254            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1255 
1256            It is unclear what a proper value of matstructT->matDescr would be for an empty matrix, so we set
1257            it to NULL so that any use of it fails loudly. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1258            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase, so we also set that here.
1259         */
1260         if (matrixT->num_entries) {
1261           stat = cusparseCreateCsr(&matstructT->matDescr,
1262                                  matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1263                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1264                                  matrixT->values->data().get(),
1265                                  CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
1266                                  indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
1267 
1268         } else {
1269           matstructT->matDescr = NULL;
1270           matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1271         }
1272       #endif
1273      #endif
1274     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1275    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1276       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1277    #else
1278       CsrMatrix *temp  = new CsrMatrix;
1279       CsrMatrix *tempT = new CsrMatrix;
1280       /* First convert HYB to CSR */
1281       temp->num_rows = A->rmap->n;
1282       temp->num_cols = A->cmap->n;
1283       temp->num_entries = a->nz;
1284       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1285       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1286       temp->values = new THRUSTARRAY(a->nz);
1287 
1288       stat = cusparse_hyb2csr(cusparsestruct->handle,
1289                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1290                               temp->values->data().get(),
1291                               temp->row_offsets->data().get(),
1292                               temp->column_indices->data().get());PetscCallCUSPARSE(stat);
1293 
1294       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1295       tempT->num_rows = A->rmap->n;
1296       tempT->num_cols = A->cmap->n;
1297       tempT->num_entries = a->nz;
1298       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1299       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1300       tempT->values = new THRUSTARRAY(a->nz);
1301 
1302       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1303                               temp->num_cols, temp->num_entries,
1304                               temp->values->data().get(),
1305                               temp->row_offsets->data().get(),
1306                               temp->column_indices->data().get(),
1307                               tempT->values->data().get(),
1308                               tempT->column_indices->data().get(),
1309                               tempT->row_offsets->data().get(),
1310                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1311 
1312       /* Last, convert CSC to HYB */
1313       cusparseHybMat_t hybMat;
1314       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1315       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1316         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1317       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1318                               matstructT->descr, tempT->values->data().get(),
1319                               tempT->row_offsets->data().get(),
1320                               tempT->column_indices->data().get(),
1321                               hybMat, 0, partition);PetscCallCUSPARSE(stat);
1322 
1323       /* assign the pointer */
1324       matstructT->mat = hybMat;
1325       A->transupdated = PETSC_TRUE;
1326       /* delete temporaries */
1327       if (tempT) {
1328         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1329         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1330         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1331         delete (CsrMatrix*) tempT;
1332       }
1333       if (temp) {
1334         if (temp->values) delete (THRUSTARRAY*) temp->values;
1335         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1336         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1337         delete (CsrMatrix*) temp;
1338       }
1339      #endif
1340     }
1341   }
1342   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1343     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1344     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1345     PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
1346     PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
1347     PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
1348     PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
1349     PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
1350     PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
1351     PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
1352     PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
1353     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1354       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1355       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1356       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
1357     }
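    /* Build the CSR->CSC permutation once per nonzero pattern: run csr2csc on the sequence 0,1,2,...
       so that, for each entry of the transpose, the transposed value gives its position in the original
       CSR value array; updating the transpose is then the simple gather done with thrust::copy() below */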
1358     if (!cusparsestruct->csr2csc_i) {
1359       THRUSTARRAY csr2csc_a(matrix->num_entries);
1360       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1361 
1362       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1363      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1364       void   *csr2cscBuffer;
1365       size_t csr2cscBufferSize;
1366       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1367                                            A->cmap->n, matrix->num_entries,
1368                                            matrix->values->data().get(),
1369                                            cusparsestruct->rowoffsets_gpu->data().get(),
1370                                            matrix->column_indices->data().get(),
1371                                            matrixT->values->data().get(),
1372                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1373                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1374                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
1375       PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
1376      #endif
1377 
1378       if (matrix->num_entries) {
1379         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1380            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11, while CUDA-10 is fine.
1381            Every parameter was checked and looked correct; it is unclear why cusparse complains.
1382 
1383            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1384            should be filled with indexBase, so in the else branch below we simply fill it directly instead of calling csr2csc.
1385         */
1386         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1387                               A->cmap->n,matrix->num_entries,
1388                               csr2csc_a.data().get(),
1389                               cusparsestruct->rowoffsets_gpu->data().get(),
1390                               matrix->column_indices->data().get(),
1391                               matrixT->values->data().get(),
1392                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1393                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1394                               CUSPARSE_ACTION_NUMERIC,indexBase,
1395                               cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
1396                              #else
1397                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1398                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1399                              #endif
1400       } else {
1401         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1402       }
1403 
1404       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1405       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1406      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1407       PetscCallCUDA(cudaFree(csr2cscBuffer));
1408      #endif
1409     }
1410     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1411                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1412                                                      matrixT->values->begin()));
1413   }
1414   PetscCall(PetscLogGpuTimeEnd());
1415   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1416   /* the compressed row indices are not used for matTranspose */
1417   matstructT->cprowIndices = NULL;
1418   /* assign the pointer */
1419   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1420   A->transupdated = PETSC_TRUE;
1421   PetscFunctionReturn(0);
1422 }
1423 
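/* Solve A^T x = b using explicitly stored transposes of the triangular factors (built on first use by
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve()), applying the row and column permutations on the GPU */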
1424 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1425 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1426 {
1427   PetscInt                              n = xx->map->n;
1428   const PetscScalar                     *barray;
1429   PetscScalar                           *xarray;
1430   thrust::device_ptr<const PetscScalar> bGPU;
1431   thrust::device_ptr<PetscScalar>       xGPU;
1432   cusparseStatus_t                      stat;
1433   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1434   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1435   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1436   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1437 
1438   PetscFunctionBegin;
1439   /* Analyze the matrix and create the transpose ... on the fly */
1440   if (!loTriFactorT && !upTriFactorT) {
1441     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1442     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1443     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1444   }
1445 
1446   /* Get the GPU pointers */
1447   PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
1448   PetscCall(VecCUDAGetArrayRead(bb,&barray));
1449   xGPU = thrust::device_pointer_cast(xarray);
1450   bGPU = thrust::device_pointer_cast(barray);
1451 
1452   PetscCall(PetscLogGpuTimeBegin());
1453   /* First, reorder with the row permutation */
1454   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1455                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1456                xGPU);
1457 
1458   /* Next, solve U */
1459   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1460                         upTriFactorT->csrMat->num_rows,
1461                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1462                         upTriFactorT->csrMat->num_entries,
1463                       #endif
1464                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1465                         upTriFactorT->csrMat->values->data().get(),
1466                         upTriFactorT->csrMat->row_offsets->data().get(),
1467                         upTriFactorT->csrMat->column_indices->data().get(),
1468                         upTriFactorT->solveInfo,
1469                         xarray,
1470                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1471                         tempGPU->data().get(),
1472                         upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
1473                       #else
1474                         tempGPU->data().get());PetscCallCUSPARSE(stat);
1475                       #endif
1476 
1477   /* Then, solve L */
1478   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1479                         loTriFactorT->csrMat->num_rows,
1480                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1481                         loTriFactorT->csrMat->num_entries,
1482                       #endif
1483                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1484                         loTriFactorT->csrMat->values->data().get(),
1485                         loTriFactorT->csrMat->row_offsets->data().get(),
1486                         loTriFactorT->csrMat->column_indices->data().get(),
1487                         loTriFactorT->solveInfo,
1488                         tempGPU->data().get(),
1489                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1490                         xarray,
1491                         loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
1492                       #else
1493                          xarray);PetscCallCUSPARSE(stat);
1494                       #endif
1495 
1496   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1497   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1498                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1499                tempGPU->begin());
1500 
1501   /* Copy the temporary to the full solution. */
1502   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);
1503 
1504   /* restore */
1505   PetscCall(VecCUDARestoreArrayRead(bb,&barray));
1506   PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
1507   PetscCall(PetscLogGpuTimeEnd());
1508   PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
1509   PetscFunctionReturn(0);
1510 }
1511 
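/* Same as MatSolveTranspose_SeqAIJCUSPARSE(), but for the natural ordering, so no row/column permutations are needed */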
1512 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1513 {
1514   const PetscScalar                 *barray;
1515   PetscScalar                       *xarray;
1516   cusparseStatus_t                  stat;
1517   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1518   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1519   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1520   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1521 
1522   PetscFunctionBegin;
1523   /* Analyze the matrix and create the transpose ... on the fly */
1524   if (!loTriFactorT && !upTriFactorT) {
1525     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1526     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1527     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1528   }
1529 
1530   /* Get the GPU pointers */
1531   PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
1532   PetscCall(VecCUDAGetArrayRead(bb,&barray));
1533 
1534   PetscCall(PetscLogGpuTimeBegin());
1535   /* First, solve U */
1536   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1537                         upTriFactorT->csrMat->num_rows,
1538                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1539                         upTriFactorT->csrMat->num_entries,
1540                       #endif
1541                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1542                         upTriFactorT->csrMat->values->data().get(),
1543                         upTriFactorT->csrMat->row_offsets->data().get(),
1544                         upTriFactorT->csrMat->column_indices->data().get(),
1545                         upTriFactorT->solveInfo,
1546                         barray,
1547                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1548                         tempGPU->data().get(),
1549                         upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
1550                       #else
1551                         tempGPU->data().get());PetscCallCUSPARSE(stat);
1552                       #endif
1553 
1554   /* Then, solve L */
1555   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1556                         loTriFactorT->csrMat->num_rows,
1557                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1558                         loTriFactorT->csrMat->num_entries,
1559                       #endif
1560                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1561                         loTriFactorT->csrMat->values->data().get(),
1562                         loTriFactorT->csrMat->row_offsets->data().get(),
1563                         loTriFactorT->csrMat->column_indices->data().get(),
1564                         loTriFactorT->solveInfo,
1565                         tempGPU->data().get(),
1566                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1567                         xarray,
1568                         loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
1569                       #else
1570                         xarray);PetscCallCUSPARSE(stat);
1571                       #endif
1572 
1573   /* restore */
1574   PetscCall(VecCUDARestoreArrayRead(bb,&barray));
1575   PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
1576   PetscCall(PetscLogGpuTimeEnd());
1577   PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
1578   PetscFunctionReturn(0);
1579 }
1580 
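/* Solve A x = b with the triangular factors on the GPU: permute b with the row permutation, solve with L
   and then U, and finally apply the column permutation to obtain x */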
1581 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1582 {
1583   const PetscScalar                     *barray;
1584   PetscScalar                           *xarray;
1585   thrust::device_ptr<const PetscScalar> bGPU;
1586   thrust::device_ptr<PetscScalar>       xGPU;
1587   cusparseStatus_t                      stat;
1588   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1589   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1590   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1591   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1592 
1593   PetscFunctionBegin;
1594 
1595   /* Get the GPU pointers */
1596   PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
1597   PetscCall(VecCUDAGetArrayRead(bb,&barray));
1598   xGPU = thrust::device_pointer_cast(xarray);
1599   bGPU = thrust::device_pointer_cast(barray);
1600 
1601   PetscCall(PetscLogGpuTimeBegin());
1602   /* First, reorder with the row permutation */
1603   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1604                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1605                tempGPU->begin());
1606 
1607   /* Next, solve L */
1608   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1609                         loTriFactor->csrMat->num_rows,
1610                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1611                         loTriFactor->csrMat->num_entries,
1612                       #endif
1613                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1614                         loTriFactor->csrMat->values->data().get(),
1615                         loTriFactor->csrMat->row_offsets->data().get(),
1616                         loTriFactor->csrMat->column_indices->data().get(),
1617                         loTriFactor->solveInfo,
1618                         tempGPU->data().get(),
1619                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1620                          xarray,
1621                          loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
1622                       #else
1623                          xarray);PetscCallCUSPARSE(stat);
1624                       #endif
1625 
1626   /* Then, solve U */
1627   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1628                         upTriFactor->csrMat->num_rows,
1629                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1630                         upTriFactor->csrMat->num_entries,
1631                       #endif
1632                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1633                         upTriFactor->csrMat->values->data().get(),
1634                         upTriFactor->csrMat->row_offsets->data().get(),
1635                         upTriFactor->csrMat->column_indices->data().get(),
1636                         upTriFactor->solveInfo,xarray,
1637                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1638                         tempGPU->data().get(),
1639                         upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
1640                       #else
1641                         tempGPU->data().get());PetscCallCUSPARSE(stat);
1642                       #endif
1643 
1644   /* Last, reorder with the column permutation */
1645   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1646                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1647                xGPU);
1648 
1649   PetscCall(VecCUDARestoreArrayRead(bb,&barray));
1650   PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
1651   PetscCall(PetscLogGpuTimeEnd());
1652   PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
1653   PetscFunctionReturn(0);
1654 }
1655 
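/* Same as MatSolve_SeqAIJCUSPARSE(), but for the natural ordering, so no row/column permutations are needed */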
1656 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1657 {
1658   const PetscScalar                 *barray;
1659   PetscScalar                       *xarray;
1660   cusparseStatus_t                  stat;
1661   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1662   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1663   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1664   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1665 
1666   PetscFunctionBegin;
1667   /* Get the GPU pointers */
1668   PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
1669   PetscCall(VecCUDAGetArrayRead(bb,&barray));
1670 
1671   PetscCall(PetscLogGpuTimeBegin());
1672   /* First, solve L */
1673   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1674                         loTriFactor->csrMat->num_rows,
1675                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1676                         loTriFactor->csrMat->num_entries,
1677                       #endif
1678                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1679                         loTriFactor->csrMat->values->data().get(),
1680                         loTriFactor->csrMat->row_offsets->data().get(),
1681                         loTriFactor->csrMat->column_indices->data().get(),
1682                         loTriFactor->solveInfo,
1683                         barray,
1684                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1685                         tempGPU->data().get(),
1686                         loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
1687                       #else
1688                         tempGPU->data().get());PetscCallCUSPARSE(stat);
1689                       #endif
1690 
1691   /* Next, solve U */
1692   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1693                         upTriFactor->csrMat->num_rows,
1694                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1695                         upTriFactor->csrMat->num_entries,
1696                       #endif
1697                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1698                         upTriFactor->csrMat->values->data().get(),
1699                         upTriFactor->csrMat->row_offsets->data().get(),
1700                         upTriFactor->csrMat->column_indices->data().get(),
1701                         upTriFactor->solveInfo,
1702                         tempGPU->data().get(),
1703                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1704                         xarray,
1705                         upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
1706                       #else
1707                         xarray);PetscCallCUSPARSE(stat);
1708                       #endif
1709 
1710   PetscCall(VecCUDARestoreArrayRead(bb,&barray));
1711   PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
1712   PetscCall(PetscLogGpuTimeEnd());
1713   PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
1714   PetscFunctionReturn(0);
1715 }
1716 
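/* Copy the matrix values (the sparsity pattern is unchanged) from the GPU back to the host CSR arrays
   when the GPU copy is the up-to-date one */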
1717 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1718 {
1719   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1720   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1721 
1722   PetscFunctionBegin;
1723   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1724     CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
1725 
1726     PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
1727     PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
1728     PetscCallCUDA(WaitForCUDA());
1729     PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar)));
1730     PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
1731     A->offloadmask = PETSC_OFFLOAD_BOTH;
1732   }
1733   PetscFunctionReturn(0);
1734 }
1735 
1736 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1737 {
1738   PetscFunctionBegin;
1739   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
1740   *array = ((Mat_SeqAIJ*)A->data)->a;
1741   PetscFunctionReturn(0);
1742 }
1743 
1744 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1745 {
1746   PetscFunctionBegin;
1747   A->offloadmask = PETSC_OFFLOAD_CPU;
1748   *array         = NULL;
1749   PetscFunctionReturn(0);
1750 }
1751 
1752 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
1753 {
1754   PetscFunctionBegin;
1755   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
1756   *array = ((Mat_SeqAIJ*)A->data)->a;
1757   PetscFunctionReturn(0);
1758 }
1759 
1760 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
1761 {
1762   PetscFunctionBegin;
1763   *array = NULL;
1764   PetscFunctionReturn(0);
1765 }
1766 
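/* Write-only access: the current values will be overwritten by the caller, so no device-to-host copy is needed */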
1767 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1768 {
1769   PetscFunctionBegin;
1770   *array = ((Mat_SeqAIJ*)A->data)->a;
1771   PetscFunctionReturn(0);
1772 }
1773 
1774 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1775 {
1776   PetscFunctionBegin;
1777   A->offloadmask = PETSC_OFFLOAD_CPU;
1778   *array         = NULL;
1779   PetscFunctionReturn(0);
1780 }
1781 
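/* Return device pointers to the CSR row offsets, column indices and values of the matrix stored on the GPU */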
1782 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
1783 {
1784   Mat_SeqAIJCUSPARSE           *cusp;
1785   CsrMatrix                    *matrix;
1786 
1787   PetscFunctionBegin;
1788   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1789   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1790   cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
1791   PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL");
1792   matrix = (CsrMatrix*)cusp->mat->mat;
1793 
1794   if (i) {
1795    #if !defined(PETSC_USE_64BIT_INDICES)
1796     *i = matrix->row_offsets->data().get();
1797    #else
1798     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not support 64-bit indices");
1799    #endif
1800   }
1801   if (j) {
1802    #if !defined(PETSC_USE_64BIT_INDICES)
1803     *j = matrix->column_indices->data().get();
1804    #else
1805     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not support 64-bit indices");
1806    #endif
1807   }
1808   if (a) *a = matrix->values->data().get();
1809   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
1810   PetscFunctionReturn(0);
1811 }
1812 
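/* Copy the host CSR data to the GPU. If the nonzero pattern is unchanged only the values are copied;
   otherwise the cusparse structures (and any cached transpose) are rebuilt from scratch */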
1813 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1814 {
1815   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1816   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1817   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1818   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1819   cusparseStatus_t             stat;
1820   PetscBool                    both = PETSC_TRUE;
1821 
1822   PetscFunctionBegin;
1823   PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
1824   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1825     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1826       CsrMatrix *matrix;
1827       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1828 
1829       PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
1830       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
1831       matrix->values->assign(a->a, a->a+a->nz);
1832       PetscCallCUDA(WaitForCUDA());
1833       PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
1834       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
1835       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
1836     } else {
1837       PetscInt nnz;
1838       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
1839       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
1840       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
1841       delete cusparsestruct->workVector;
1842       delete cusparsestruct->rowoffsets_gpu;
1843       cusparsestruct->workVector = NULL;
1844       cusparsestruct->rowoffsets_gpu = NULL;
1845       try {
1846         if (a->compressedrow.use) {
1847           m    = a->compressedrow.nrows;
1848           ii   = a->compressedrow.i;
1849           ridx = a->compressedrow.rindex;
1850         } else {
1851           m    = A->rmap->n;
1852           ii   = a->i;
1853           ridx = NULL;
1854         }
1855         PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
1856         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1857         else nnz = a->nz;
1858         PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
1859 
1860         /* create cusparse matrix */
1861         cusparsestruct->nrows = m;
1862         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1863         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
1864         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
1865         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1866 
1867         PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
1868         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
1869         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
1870         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1871         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
1872         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1873         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
1874 
1875         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1876         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1877           /* set the matrix */
1878           CsrMatrix *mat= new CsrMatrix;
1879           mat->num_rows = m;
1880           mat->num_cols = A->cmap->n;
1881           mat->num_entries = nnz;
1882           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1883           mat->row_offsets->assign(ii, ii + m+1);
1884 
1885           mat->column_indices = new THRUSTINTARRAY32(nnz);
1886           mat->column_indices->assign(a->j, a->j+nnz);
1887 
1888           mat->values = new THRUSTARRAY(nnz);
1889           if (a->a) mat->values->assign(a->a, a->a+nnz);
1890 
1891           /* assign the pointer */
1892           matstruct->mat = mat;
1893          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1894           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1895             stat = cusparseCreateCsr(&matstruct->matDescr,
1896                                     mat->num_rows, mat->num_cols, mat->num_entries,
1897                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1898                                     mat->values->data().get(),
1899                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1900                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
1901           }
1902          #endif
1903         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1904          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1905           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1906          #else
1907           CsrMatrix *mat= new CsrMatrix;
1908           mat->num_rows = m;
1909           mat->num_cols = A->cmap->n;
1910           mat->num_entries = nnz;
1911           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1912           mat->row_offsets->assign(ii, ii + m+1);
1913 
1914           mat->column_indices = new THRUSTINTARRAY32(nnz);
1915           mat->column_indices->assign(a->j, a->j+nnz);
1916 
1917           mat->values = new THRUSTARRAY(nnz);
1918           if (a->a) mat->values->assign(a->a, a->a+nnz);
1919 
1920           cusparseHybMat_t hybMat;
1921           PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1922           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1923             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1924           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1925               matstruct->descr, mat->values->data().get(),
1926               mat->row_offsets->data().get(),
1927               mat->column_indices->data().get(),
1928               hybMat, 0, partition);PetscCallCUSPARSE(stat);
1929           /* assign the pointer */
1930           matstruct->mat = hybMat;
1931 
1932           if (mat) {
1933             if (mat->values) delete (THRUSTARRAY*)mat->values;
1934             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1935             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1936             delete (CsrMatrix*)mat;
1937           }
1938          #endif
1939         }
1940 
1941         /* assign the compressed row indices */
1942         if (a->compressedrow.use) {
1943           cusparsestruct->workVector = new THRUSTARRAY(m);
1944           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1945           matstruct->cprowIndices->assign(ridx,ridx+m);
1946           tmp = m;
1947         } else {
1948           cusparsestruct->workVector = NULL;
1949           matstruct->cprowIndices    = NULL;
1950           tmp = 0;
1951         }
1952         PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));
1953 
1954         /* assign the pointer */
1955         cusparsestruct->mat = matstruct;
1956       } catch(char *ex) {
1957         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1958       }
1959       PetscCallCUDA(WaitForCUDA());
1960       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
1961       cusparsestruct->nonzerostate = A->nonzerostate;
1962     }
1963     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1964   }
1965   PetscFunctionReturn(0);
1966 }
1967 
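/* Helper functors applied with thrust::for_each() over zip iterators; each one copies or accumulates one
   element of the tuple into the other */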
1968 struct VecCUDAPlusEquals
1969 {
1970   template <typename Tuple>
1971   __host__ __device__
1972   void operator()(Tuple t)
1973   {
1974     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1975   }
1976 };
1977 
1978 struct VecCUDAEquals
1979 {
1980   template <typename Tuple>
1981   __host__ __device__
1982   void operator()(Tuple t)
1983   {
1984     thrust::get<1>(t) = thrust::get<0>(t);
1985   }
1986 };
1987 
1988 struct VecCUDAEqualsReverse
1989 {
1990   template <typename Tuple>
1991   __host__ __device__
1992   void operator()(Tuple t)
1993   {
1994     thrust::get<0>(t) = thrust::get<1>(t);
1995   }
1996 };
1997 
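/* Data cached in C->product->data for matrix-matrix products involving a SeqAIJCUSPARSE matrix */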
1998 struct MatMatCusparse {
1999   PetscBool             cisdense;
2000   PetscScalar           *Bt;
2001   Mat                   X;
2002   PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2003   PetscLogDouble        flops;
2004   CsrMatrix             *Bcsr;
2005 
2006 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2007   cusparseSpMatDescr_t  matSpBDescr;
2008   PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
2009   cusparseDnMatDescr_t  matBDescr;
2010   cusparseDnMatDescr_t  matCDescr;
2011   PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes */
2012  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2013   void                  *dBuffer4;
2014   void                  *dBuffer5;
2015  #endif
2016   size_t                mmBufferSize;
2017   void                  *mmBuffer;
2018   void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2019   cusparseSpGEMMDescr_t spgemmDesc;
2020 #endif
2021 };
2022 
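/* Destroy callback for the product data above; frees the cached buffers, cusparse descriptors and auxiliary matrices */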
2023 static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2024 {
2025   MatMatCusparse *mmdata = (MatMatCusparse *)data;
2026 
2027   PetscFunctionBegin;
2028   PetscCallCUDA(cudaFree(mmdata->Bt));
2029   delete mmdata->Bcsr;
2030  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2031   if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2032   if (mmdata->matBDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2033   if (mmdata->matCDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2034   if (mmdata->spgemmDesc)  PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2035  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2036   if (mmdata->dBuffer4)  PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2037   if (mmdata->dBuffer5)  PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2038  #endif
2039   if (mmdata->mmBuffer)  PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2040   if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2041  #endif
2042   PetscCall(MatDestroy(&mmdata->X));
2043   PetscCall(PetscFree(data));
2044   PetscFunctionReturn(0);
2045 }
2046 
2047 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2048 
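/* Numeric phase of C = op(A)*op(B) with A sparse (SeqAIJCUSPARSE) and B dense (SeqDenseCUDA); uses
   cusparseSpMM() on CUDA-11 and later, and the older csrmm-based path otherwise */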
2049 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2050 {
2051   Mat_Product                  *product = C->product;
2052   Mat                          A,B;
2053   PetscInt                     m,n,blda,clda;
2054   PetscBool                    flg,biscuda;
2055   Mat_SeqAIJCUSPARSE           *cusp;
2056   cusparseStatus_t             stat;
2057   cusparseOperation_t          opA;
2058   const PetscScalar            *barray;
2059   PetscScalar                  *carray;
2060   MatMatCusparse               *mmdata;
2061   Mat_SeqAIJCUSPARSEMultStruct *mat;
2062   CsrMatrix                    *csrmat;
2063 
2064   PetscFunctionBegin;
2065   MatCheckProduct(C,1);
2066   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2067   mmdata = (MatMatCusparse*)product->data;
2068   A    = product->A;
2069   B    = product->B;
2070   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
2071   PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2072   /* currently CopyToGpu does not copy if the matrix is bound to the CPU.
2073      Instead of silently accepting the wrong answer, we raise an error */
2074   PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2075   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2076   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2077   switch (product->type) {
2078   case MATPRODUCT_AB:
2079   case MATPRODUCT_PtAP:
2080     mat = cusp->mat;
2081     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2082     m   = A->rmap->n;
2083     n   = B->cmap->n;
2084     break;
2085   case MATPRODUCT_AtB:
2086     if (!A->form_explicit_transpose) {
2087       mat = cusp->mat;
2088       opA = CUSPARSE_OPERATION_TRANSPOSE;
2089     } else {
2090       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2091       mat  = cusp->matTranspose;
2092       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2093     }
2094     m = A->cmap->n;
2095     n = B->cmap->n;
2096     break;
2097   case MATPRODUCT_ABt:
2098   case MATPRODUCT_RARt:
2099     mat = cusp->mat;
2100     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2101     m   = A->rmap->n;
2102     n   = B->rmap->n;
2103     break;
2104   default:
2105     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2106   }
2107   PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2108   csrmat = (CsrMatrix*)mat->mat;
2109   /* if the user passed a CPU matrix, copy the data to the GPU */
2110   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
2111   if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
2112   PetscCall(MatDenseCUDAGetArrayRead(B,&barray));
2113 
2114   PetscCall(MatDenseGetLDA(B,&blda));
2115   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2116     PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
2117     PetscCall(MatDenseGetLDA(mmdata->X,&clda));
2118   } else {
2119     PetscCall(MatDenseCUDAGetArrayWrite(C,&carray));
2120     PetscCall(MatDenseGetLDA(C,&clda));
2121   }
2122 
2123   PetscCall(PetscLogGpuTimeBegin());
2124  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2125   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2126   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2127   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2128     size_t mmBufferSize;
2129     if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
2130     if (!mmdata->matBDescr) {
2131       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
2132       mmdata->Blda = blda;
2133     }
2134 
2135     if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
2136     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2137       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
2138       mmdata->Clda = clda;
2139     }
2140 
2141     if (!mat->matDescr) {
2142       stat = cusparseCreateCsr(&mat->matDescr,
2143                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2144                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2145                                csrmat->values->data().get(),
2146                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2147                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
2148     }
2149     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2150                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2151                                    mmdata->matCDescr,cusparse_scalartype,
2152                                    cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat);
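    /* (re)allocate the SpMM work buffer only when it is missing or smaller than the size just reported by cusparseSpMM_bufferSize() */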
2153     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2154       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2155       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
2156       mmdata->mmBufferSize = mmBufferSize;
2157     }
2158     mmdata->initialized = PETSC_TRUE;
2159   } else {
2160     /* to be safe, always update the value pointers of the matrix descriptors */
2161     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
2162     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
2163     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
2164   }
2165 
2166   /* do cusparseSpMM, which supports transpose on B */
2167   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2168                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2169                       mmdata->matCDescr,cusparse_scalartype,
2170                       cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat);
2171  #else
2172   PetscInt k;
2173   /* cusparseXcsrmm does not support transpose on B */
2174   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2175     cublasHandle_t cublasv2handle;
2176     cublasStatus_t cerr;
2177 
2178     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
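    /* out-of-place transpose: with alpha = 1, beta = 0 and both operands transposed, cublasXgeam() writes Bt = B^T,
       which the column-major csrmm call below can then use directly */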
2179     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2180                        B->cmap->n,B->rmap->n,
2181                        &PETSC_CUSPARSE_ONE ,barray,blda,
2182                        &PETSC_CUSPARSE_ZERO,barray,blda,
2183                        mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr);
2184     blda = B->cmap->n;
2185     k    = B->cmap->n;
2186   } else {
2187     k    = B->rmap->n;
2188   }
2189 
2190   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2191   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2192                            csrmat->num_entries,mat->alpha_one,mat->descr,
2193                            csrmat->values->data().get(),
2194                            csrmat->row_offsets->data().get(),
2195                            csrmat->column_indices->data().get(),
2196                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2197                            carray,clda);PetscCallCUSPARSE(stat);
2198  #endif
2199   PetscCall(PetscLogGpuTimeEnd());
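  /* each of the n dense columns touches every stored entry of A once (one multiply and one add) */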
2200   PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
2201   PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
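  /* for RARt the SpMM above computed X = A*R^T (with R = B), so the dense-dense product below gives C = R*X = R*A*R^T;
     for PtAP it computed X = A*P (with P = B), and the dense-dense product gives C = P^T*X = P^T*A*P */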
2202   if (product->type == MATPRODUCT_RARt) {
2203     PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
2204     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
2205   } else if (product->type == MATPRODUCT_PtAP) {
2206     PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
2207     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
2208   } else {
2209     PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
2210   }
2211   if (mmdata->cisdense) {
2212     PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
2213   }
2214   if (!biscuda) {
2215     PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
2216   }
2217   PetscFunctionReturn(0);
2218 }
2219 
2220 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2221 {
2222   Mat_Product        *product = C->product;
2223   Mat                A,B;
2224   PetscInt           m,n;
2225   PetscBool          cisdense,flg;
2226   MatMatCusparse     *mmdata;
2227   Mat_SeqAIJCUSPARSE *cusp;
2228 
2229   PetscFunctionBegin;
2230   MatCheckProduct(C,1);
2231   PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2232   A    = product->A;
2233   B    = product->B;
2234   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
2235   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2236   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2237   PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2238   switch (product->type) {
2239   case MATPRODUCT_AB:
2240     m = A->rmap->n;
2241     n = B->cmap->n;
2242     break;
2243   case MATPRODUCT_AtB:
2244     m = A->cmap->n;
2245     n = B->cmap->n;
2246     break;
2247   case MATPRODUCT_ABt:
2248     m = A->rmap->n;
2249     n = B->rmap->n;
2250     break;
2251   case MATPRODUCT_PtAP:
2252     m = B->cmap->n;
2253     n = B->cmap->n;
2254     break;
2255   case MATPRODUCT_RARt:
2256     m = B->rmap->n;
2257     n = B->rmap->n;
2258     break;
2259   default:
2260     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2261   }
2262   PetscCall(MatSetSizes(C,m,n,m,n));
2263   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2264   PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
2265   PetscCall(MatSetType(C,MATSEQDENSECUDA));
2266 
2267   /* product data */
2268   PetscCall(PetscNew(&mmdata));
2269   mmdata->cisdense = cisdense;
2270  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2271   /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
2272   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2273     PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
2274   }
2275  #endif
2276   /* for these products we need intermediate storage */
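  /* X holds the intermediate sparse-dense product: A*B^T for RARt and A*B for PtAP; the final dense-dense product is done in the numeric phase */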
2277   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2278     PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
2279     PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
2280     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2281       PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
2282     } else {
2283       PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
2284     }
2285   }
2286   C->product->data    = mmdata;
2287   C->product->destroy = MatDestroy_MatMatCusparse;
2288 
2289   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2290   PetscFunctionReturn(0);
2291 }
2292 
2293 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2294 {
2295   Mat_Product                  *product = C->product;
2296   Mat                          A,B;
2297   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2298   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2299   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2300   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2301   PetscBool                    flg;
2302   cusparseStatus_t             stat;
2303   MatProductType               ptype;
2304   MatMatCusparse               *mmdata;
2305 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2306   cusparseSpMatDescr_t         BmatSpDescr;
2307 #endif
2308   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2309 
2310   PetscFunctionBegin;
2311   MatCheckProduct(C,1);
2312   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2313   PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
2314   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2315   mmdata = (MatMatCusparse*)C->product->data;
2316   A = product->A;
2317   B = product->B;
2318   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2319     mmdata->reusesym = PETSC_FALSE;
2320     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2321     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2322     Cmat = Ccusp->mat;
2323     PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2324     Ccsr = (CsrMatrix*)Cmat->mat;
2325     PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2326     goto finalize;
2327   }
2328   if (!c->nz) goto finalize;
2329   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
2330   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2331   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
2332   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2333   PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2334   PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2335   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2336   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2337   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2338   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2339   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2340   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2341   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2342   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2343 
2344   ptype = product->type;
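  /* if A (B) is symmetric, A^T*B (A*B^T) equals A*B, so the symbolic phase may have downgraded the product type to plain AB */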
2345   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2346     ptype = MATPRODUCT_AB;
2347     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
2348   }
2349   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2350     ptype = MATPRODUCT_AB;
2351     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
2352   }
2353   switch (ptype) {
2354   case MATPRODUCT_AB:
2355     Amat = Acusp->mat;
2356     Bmat = Bcusp->mat;
2357     break;
2358   case MATPRODUCT_AtB:
2359     Amat = Acusp->matTranspose;
2360     Bmat = Bcusp->mat;
2361     break;
2362   case MATPRODUCT_ABt:
2363     Amat = Acusp->mat;
2364     Bmat = Bcusp->matTranspose;
2365     break;
2366   default:
2367     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2368   }
2369   Cmat = Ccusp->mat;
2370   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2371   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2372   PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2373   Acsr = (CsrMatrix*)Amat->mat;
2374   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2375   Ccsr = (CsrMatrix*)Cmat->mat;
2376   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2377   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2378   PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2379   PetscCall(PetscLogGpuTimeBegin());
2380 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2381   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2382   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2383   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2384     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2385                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2386                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2387                                mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2388   #else
2389     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2390                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2391                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2392                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
2393     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2394                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2395                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2396   #endif
2397 #else
2398   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2399                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2400                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2401                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2402                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
2403 #endif
2404   PetscCall(PetscLogGpuFlops(mmdata->flops));
2405   PetscCallCUDA(WaitForCUDA());
2406   PetscCall(PetscLogGpuTimeEnd());
2407   C->offloadmask = PETSC_OFFLOAD_GPU;
2408 finalize:
2409   /* shorter version of MatAssemblyEnd_SeqAIJ */
2410   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
2411   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
2412   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
2413   c->reallocs         = 0;
2414   C->info.mallocs    += 0;
2415   C->info.nz_unneeded = 0;
2416   C->assembled = C->was_assembled = PETSC_TRUE;
2417   C->num_ass++;
2418   PetscFunctionReturn(0);
2419 }
2420 
2421 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2422 {
2423   Mat_Product                  *product = C->product;
2424   Mat                          A,B;
2425   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2426   Mat_SeqAIJ                   *a,*b,*c;
2427   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2428   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2429   PetscInt                     i,j,m,n,k;
2430   PetscBool                    flg;
2431   cusparseStatus_t             stat;
2432   MatProductType               ptype;
2433   MatMatCusparse               *mmdata;
2434   PetscLogDouble               flops;
2435   PetscBool                    biscompressed,ciscompressed;
2436 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2437   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2438   cusparseSpMatDescr_t         BmatSpDescr;
2439 #else
2440   int                          cnz;
2441 #endif
2442   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2443 
2444   PetscFunctionBegin;
2445   MatCheckProduct(C,1);
2446   PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2447   A    = product->A;
2448   B    = product->B;
2449   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
2450   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2451   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
2452   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2453   a = (Mat_SeqAIJ*)A->data;
2454   b = (Mat_SeqAIJ*)B->data;
2455   /* product data */
2456   PetscCall(PetscNew(&mmdata));
2457   C->product->data    = mmdata;
2458   C->product->destroy = MatDestroy_MatMatCusparse;
2459 
2460   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2461   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2462   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2463   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2464   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2465   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2466 
2467   ptype = product->type;
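  /* if A (B) is symmetric, A^T*B (A*B^T) equals A*B, so fall back to the plain AB product */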
2468   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2469     ptype = MATPRODUCT_AB;
2470     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2471   }
2472   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2473     ptype = MATPRODUCT_AB;
2474     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2475   }
2476   biscompressed = PETSC_FALSE;
2477   ciscompressed = PETSC_FALSE;
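  /* result sizes: op(A) is m x k, op(B) is k x n, so C is m x n */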
2478   switch (ptype) {
2479   case MATPRODUCT_AB:
2480     m = A->rmap->n;
2481     n = B->cmap->n;
2482     k = A->cmap->n;
2483     Amat = Acusp->mat;
2484     Bmat = Bcusp->mat;
2485     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2486     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2487     break;
2488   case MATPRODUCT_AtB:
2489     m = A->cmap->n;
2490     n = B->cmap->n;
2491     k = A->rmap->n;
2492     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2493     Amat = Acusp->matTranspose;
2494     Bmat = Bcusp->mat;
2495     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2496     break;
2497   case MATPRODUCT_ABt:
2498     m = A->rmap->n;
2499     n = B->rmap->n;
2500     k = A->cmap->n;
2501     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2502     Amat = Acusp->mat;
2503     Bmat = Bcusp->matTranspose;
2504     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2505     break;
2506   default:
2507     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2508   }
2509 
2510   /* create cusparse matrix */
2511   PetscCall(MatSetSizes(C,m,n,m,n));
2512   PetscCall(MatSetType(C,MATSEQAIJCUSPARSE));
2513   c     = (Mat_SeqAIJ*)C->data;
2514   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2515   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2516   Ccsr  = new CsrMatrix;
2517 
2518   c->compressedrow.use = ciscompressed;
2519   if (c->compressedrow.use) { /* if a is in compressed row format, then c will be in compressed row format */
2520     c->compressedrow.nrows = a->compressedrow.nrows;
2521     PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
2522     PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
2523     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2524     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2525     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2526   } else {
2527     c->compressedrow.nrows  = 0;
2528     c->compressedrow.i      = NULL;
2529     c->compressedrow.rindex = NULL;
2530     Ccusp->workVector       = NULL;
2531     Cmat->cprowIndices      = NULL;
2532   }
2533   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2534   Ccusp->mat      = Cmat;
2535   Ccusp->mat->mat = Ccsr;
2536   Ccsr->num_rows    = Ccusp->nrows;
2537   Ccsr->num_cols    = n;
2538   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2539   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
2540   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
2541   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2542   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
2543   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
2544   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
2545   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
2546   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
2547   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
2548   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
2549     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2550     c->nz = 0;
2551     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2552     Ccsr->values = new THRUSTARRAY(c->nz);
2553     goto finalizesym;
2554   }
2555 
2556   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2557   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2558   Acsr = (CsrMatrix*)Amat->mat;
2559   if (!biscompressed) {
2560     Bcsr = (CsrMatrix*)Bmat->mat;
2561 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2562     BmatSpDescr = Bmat->matDescr;
2563 #endif
2564   } else { /* we need to use row offsets for the full matrix */
2565     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2566     Bcsr = new CsrMatrix;
2567     Bcsr->num_rows       = B->rmap->n;
2568     Bcsr->num_cols       = cBcsr->num_cols;
2569     Bcsr->num_entries    = cBcsr->num_entries;
2570     Bcsr->column_indices = cBcsr->column_indices;
2571     Bcsr->values         = cBcsr->values;
2572     if (!Bcusp->rowoffsets_gpu) {
2573       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2574       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2575       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
2576     }
2577     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2578     mmdata->Bcsr = Bcsr;
2579 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2580     if (Bcsr->num_rows && Bcsr->num_cols) {
2581       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2582                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2583                                Bcsr->values->data().get(),
2584                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2585                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
2586     }
2587     BmatSpDescr = mmdata->matSpBDescr;
2588 #endif
2589   }
2590   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2591   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2592   /* precompute the flop count */
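  /* for C = A*B every nonzero a_ij contributes one multiply and one add per nonzero in row j of B;
     for C = A^T*B row i of A pairs with row i of B, contributing 2*anzi*bnzi flops */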
2593   if (ptype == MATPRODUCT_AB) {
2594     for (i=0, flops = 0; i<A->rmap->n; i++) {
2595       const PetscInt st = a->i[i];
2596       const PetscInt en = a->i[i+1];
2597       for (j=st; j<en; j++) {
2598         const PetscInt brow = a->j[j];
2599         flops += 2.*(b->i[brow+1] - b->i[brow]);
2600       }
2601     }
2602   } else if (ptype == MATPRODUCT_AtB) {
2603     for (i=0, flops = 0; i<A->rmap->n; i++) {
2604       const PetscInt anzi = a->i[i+1] - a->i[i];
2605       const PetscInt bnzi = b->i[i+1] - b->i[i];
2606       flops += (2.*anzi)*bnzi;
2607     }
2608   } else { /* TODO */
2609     flops = 0.;
2610   }
2611 
2612   mmdata->flops = flops;
2613   PetscCall(PetscLogGpuTimeBegin());
2614 
2615 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2616   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2617   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2618                           NULL, NULL, NULL,
2619                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2620                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
2621   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2622  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2623  {
2624   /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2625      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2626   */
2627   void*  dBuffer1 = NULL;
2628   void*  dBuffer2 = NULL;
2629   void*  dBuffer3 = NULL;
2630   /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2631   size_t bufferSize1 = 0;
2632   size_t bufferSize2 = 0;
2633   size_t bufferSize3 = 0;
2634   size_t bufferSize4 = 0;
2635   size_t bufferSize5 = 0;
2636 
2637   /*----------------------------------------------------------------------*/
2638   /* query how many bytes of external memory (bufferSize1) are needed */
2639   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2640                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2641                                             &bufferSize1, NULL);PetscCallCUSPARSE(stat);
2642   PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
2643   /* inspect the matrices A and B to understand the memory requirement for the next step */
2644   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2645                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2646                                             &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat);
2647 
2648   /*----------------------------------------------------------------------*/
2649   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2650                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2651                                  &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat);
2652   PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
2653   PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
2654   PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
2655   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2656                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2657                                  &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat);
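  /* dBuffer1 and dBuffer2 are only needed up to the nnz phase and can be released now */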
2658   PetscCallCUDA(cudaFree(dBuffer1));
2659   PetscCallCUDA(cudaFree(dBuffer2));
2660 
2661   /*----------------------------------------------------------------------*/
2662   /* get matrix C non-zero entries C_nnz1 */
2663   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2664   c->nz = (PetscInt) C_nnz1;
2665   /* allocate matrix C */
2666   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2667   Ccsr->values         = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2668   /* update matC with the new pointers */
2669   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2670                                 Ccsr->values->data().get());PetscCallCUSPARSE(stat);
2671 
2672   /*----------------------------------------------------------------------*/
2673   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2674                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2675                                   &bufferSize5, NULL);PetscCallCUSPARSE(stat);
2676   PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
2677   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2678                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2679                                   &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat);
2680   PetscCallCUDA(cudaFree(dBuffer3));
2681   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2682                                      Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2683                                      cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2684                                      mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2685   PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
2686  }
2687  #else
2688   size_t bufSize2;
2689   /* query how many bytes of external memory are needed for the work estimation */
2690   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2691                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2692                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2693                                        mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat);
2694   PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
2695   /* inspect the matrices A and B to understand the memory requirement for the next step */
2696   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2697                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2698                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2699                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat);
2700   /* query the buffer size needed by the compute phase */
2701   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2702                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2703                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2704                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat);
2705   /* Neither the CUSPARSE documentation nor the API is clear here:
2706      we need both buffers to perform the operations properly!
2707      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2708      it only appears in the workEstimation calls, but it seems to be needed by compute, so probably its address
2709      is stored in the descriptor! What a messy API... */
2710   PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
2711   /* compute the intermediate product of A * B */
2712   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2713                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2714                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2715                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
2716   /* get matrix C non-zero entries C_nnz1 */
2717   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2718   c->nz = (PetscInt) C_nnz1;
2719   PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
2720   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2721   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2722   Ccsr->values = new THRUSTARRAY(c->nz);
2723   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2724   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2725                                 Ccsr->values->data().get());PetscCallCUSPARSE(stat);
2726   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2727                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2728                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2729  #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2730 #else
2731   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
2732   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
2733                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2734                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2735                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2736                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
2737   c->nz = cnz;
2738   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2739   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2740   Ccsr->values = new THRUSTARRAY(c->nz);
2741   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2742 
2743   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2744   /* with the old gemm interface (removed in CUDA 11.0) we cannot compute the symbolic factorization only.
2745      I have tried the gemm2 interface (alpha * A * B + beta * D), which allows doing the symbolic phase by passing NULL for the values, but it seems quite buggy when
2746      D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
2747   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2748                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2749                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2750                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2751                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
2752 #endif
2753   PetscCall(PetscLogGpuFlops(mmdata->flops));
2754   PetscCall(PetscLogGpuTimeEnd());
2755 finalizesym:
2756   c->singlemalloc = PETSC_FALSE;
2757   c->free_a       = PETSC_TRUE;
2758   c->free_ij      = PETSC_TRUE;
2759   PetscCall(PetscMalloc1(m+1,&c->i));
2760   PetscCall(PetscMalloc1(c->nz,&c->j));
2761   if (PetscDefined(USE_64BIT_INDICES)) { /* 32-bit to 64-bit index conversion on the GPU, then copy to the host (lazy) */
2762     PetscInt *d_i = c->i;
2763     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2764     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2765     ii   = *Ccsr->row_offsets;
2766     jj   = *Ccsr->column_indices;
2767     if (ciscompressed) d_i = c->compressedrow.i;
2768     PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
2769     PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
2770   } else {
2771     PetscInt *d_i = c->i;
2772     if (ciscompressed) d_i = c->compressedrow.i;
2773     PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
2774     PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
2775   }
2776   if (ciscompressed) { /* need to expand host row offsets */
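    /* c->compressedrow.i[] stores offsets only for the nonempty rows listed in rindex[]; expand them into c->i[] over all m rows, so empty rows get zero length */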
2777     PetscInt r = 0;
2778     c->i[0] = 0;
2779     for (k = 0; k < c->compressedrow.nrows; k++) {
2780       const PetscInt next = c->compressedrow.rindex[k];
2781       const PetscInt old = c->compressedrow.i[k];
2782       for (; r < next; r++) c->i[r+1] = old;
2783     }
2784     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2785   }
2786   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
2787   PetscCall(PetscMalloc1(m,&c->ilen));
2788   PetscCall(PetscMalloc1(m,&c->imax));
2789   c->maxnz = c->nz;
2790   c->nonzerorowcnt = 0;
2791   c->rmax = 0;
2792   for (k = 0; k < m; k++) {
2793     const PetscInt nn = c->i[k+1] - c->i[k];
2794     c->ilen[k] = c->imax[k] = nn;
2795     c->nonzerorowcnt += (PetscInt)!!nn;
2796     c->rmax = PetscMax(c->rmax,nn);
2797   }
2798   PetscCall(MatMarkDiagonal_SeqAIJ(C));
2799   PetscCall(PetscMalloc1(c->nz,&c->a));
2800   Ccsr->num_entries = c->nz;
2801 
2802   C->nonzerostate++;
2803   PetscCall(PetscLayoutSetUp(C->rmap));
2804   PetscCall(PetscLayoutSetUp(C->cmap));
2805   Ccusp->nonzerostate = C->nonzerostate;
2806   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2807   C->preallocated  = PETSC_TRUE;
2808   C->assembled     = PETSC_FALSE;
2809   C->was_assembled = PETSC_FALSE;
2810   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2811     mmdata->reusesym = PETSC_TRUE;
2812     C->offloadmask   = PETSC_OFFLOAD_GPU;
2813   }
2814   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2815   PetscFunctionReturn(0);
2816 }
2817 
2818 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2819 
2820 /* handles sparse or dense B */
2821 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2822 {
2823   Mat_Product    *product = mat->product;
2824   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2825 
2826   PetscFunctionBegin;
2827   MatCheckProduct(mat,1);
2828   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
2829   if (!product->A->boundtocpu && !product->B->boundtocpu) {
2830     PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
2831   }
2832   if (product->type == MATPRODUCT_ABC) {
2833     Ciscusp = PETSC_FALSE;
2834     if (!product->C->boundtocpu) {
2835       PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
2836     }
2837   }
2838   if (Biscusp && Ciscusp) { /* even when all operands are on the GPU, the user can still select the CPU backend via options */
2839     PetscBool usecpu = PETSC_FALSE;
2840     switch (product->type) {
2841     case MATPRODUCT_AB:
2842       if (product->api_user) {
2843         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
2844         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
2845         PetscOptionsEnd();
2846       } else {
2847         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
2848         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
2849         PetscOptionsEnd();
2850       }
2851       break;
2852     case MATPRODUCT_AtB:
2853       if (product->api_user) {
2854         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
2855         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
2856         PetscOptionsEnd();
2857       } else {
2858         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
2859         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
2860         PetscOptionsEnd();
2861       }
2862       break;
2863     case MATPRODUCT_PtAP:
2864       if (product->api_user) {
2865         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
2866         PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
2867         PetscOptionsEnd();
2868       } else {
2869         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
2870         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
2871         PetscOptionsEnd();
2872       }
2873       break;
2874     case MATPRODUCT_RARt:
2875       if (product->api_user) {
2876         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
2877         PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
2878         PetscOptionsEnd();
2879       } else {
2880         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
2881         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
2882         PetscOptionsEnd();
2883       }
2884       break;
2885     case MATPRODUCT_ABC:
2886       if (product->api_user) {
2887         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
2888         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
2889         PetscOptionsEnd();
2890       } else {
2891         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
2892         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
2893         PetscOptionsEnd();
2894       }
2895       break;
2896     default:
2897       break;
2898     }
2899     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2900   }
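  /* for example (assuming no options prefix), -matmatmult_backend_cpu forces the CPU path when the product was set up
     through MatMatMult(), while -mat_product_algorithm_backend_cpu does the same for the MatProduct API */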
2901   /* dispatch */
2902   if (isdense) {
2903     switch (product->type) {
2904     case MATPRODUCT_AB:
2905     case MATPRODUCT_AtB:
2906     case MATPRODUCT_ABt:
2907     case MATPRODUCT_PtAP:
2908     case MATPRODUCT_RARt:
2909      if (product->A->boundtocpu) {
2910         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
2911       } else {
2912         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2913       }
2914       break;
2915     case MATPRODUCT_ABC:
2916       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2917       break;
2918     default:
2919       break;
2920     }
2921   } else if (Biscusp && Ciscusp) {
2922     switch (product->type) {
2923     case MATPRODUCT_AB:
2924     case MATPRODUCT_AtB:
2925     case MATPRODUCT_ABt:
2926       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2927       break;
2928     case MATPRODUCT_PtAP:
2929     case MATPRODUCT_RARt:
2930     case MATPRODUCT_ABC:
2931       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2932       break;
2933     default:
2934       break;
2935     }
2936   } else { /* fallback for AIJ */
2937     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
2938   }
2939   PetscFunctionReturn(0);
2940 }
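
/* A minimal usage sketch of the MatProduct path handled above (not code from this file):
     MatProductCreate(A,B,NULL,&C);
     MatProductSetType(C,MATPRODUCT_AB);
     MatProductSetFromOptions(C);   // dispatches to the MatProductSymbolic_SeqAIJCUSPARSE_* routines above
     MatProductSymbolic(C);
     MatProductNumeric(C);
   The user-level MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C) follows the same path with api_user set. */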
2941 
2942 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2943 {
2944   PetscFunctionBegin;
2945   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
2946   PetscFunctionReturn(0);
2947 }
2948 
2949 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2950 {
2951   PetscFunctionBegin;
2952   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
2953   PetscFunctionReturn(0);
2954 }
2955 
2956 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2957 {
2958   PetscFunctionBegin;
2959   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
2960   PetscFunctionReturn(0);
2961 }
2962 
2963 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2964 {
2965   PetscFunctionBegin;
2966   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
2967   PetscFunctionReturn(0);
2968 }
2969 
2970 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2971 {
2972   PetscFunctionBegin;
2973   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
2974   PetscFunctionReturn(0);
2975 }
2976 
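/* y[idx[i]] += x[i] for i = 0,...,n-1: scatter-add a compressed (work) vector x into the full-length vector y */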
2977 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
2978 {
2979   int i = blockIdx.x*blockDim.x + threadIdx.x;
2980   if (i < n) y[idx[i]] += x[i];
2981 }
2982 
2983 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2984 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2985 {
2986   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2987   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2988   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2989   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2990   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2991   PetscBool                    compressed;
2992 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2993   PetscInt                     nx,ny;
2994 #endif
2995 
2996   PetscFunctionBegin;
2997   PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian without transpose is not supported");
2998   if (!a->nz) {
2999     if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
3000     else PetscCall(VecCopy_SeqCUDA(yy,zz));
3001     PetscFunctionReturn(0);
3002   }
3003   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3004   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3005   if (!trans) {
3006     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
3007     PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3008   } else {
3009     if (herm || !A->form_explicit_transpose) {
3010       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3011       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
3012     } else {
3013       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3014       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
3015     }
3016   }
3017   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3018   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3019 
3020   try {
3021     PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
3022     if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
3023     else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */
3024 
3025     PetscCall(PetscLogGpuTimeBegin());
3026     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3027       /* z = A x + beta y.
3028          If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3029          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3030       */
3031       xptr = xarray;
3032       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3033       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3034      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3035       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3036           allocated to accommodate different uses. So we get the length info directly from mat.
3037        */
3038       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3039         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3040         nx = mat->num_cols;
3041         ny = mat->num_rows;
3042       }
3043      #endif
3044     } else {
3045       /* z = A^T x + beta y
3046          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3047          Note A^T x is of full length, so we set beta to 1.0 if y exists.
3048        */
3049       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3050       dptr = zarray;
3051       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3052       if (compressed) { /* Scatter x to work vector */
3053         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3054         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3055                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3056                          VecCUDAEqualsReverse());
3057       }
3058      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3059       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3060         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3061         nx = mat->num_rows;
3062         ny = mat->num_cols;
3063       }
3064      #endif
3065     }
3066 
3067     /* csr_spmv does y = alpha op(A) x + beta y */
3068     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3069      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3070       PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3071       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3072         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
3073         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
3074         PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
3075                                                matstruct->matDescr,
3076                                                matstruct->cuSpMV[opA].vecXDescr, beta,
3077                                                matstruct->cuSpMV[opA].vecYDescr,
3078                                                cusparse_scalartype,
3079                                                cusparsestruct->spmvAlg,
3080                                                &matstruct->cuSpMV[opA].spmvBufferSize));
3081         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));
3082 
3083         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3084       } else {
3085         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3086         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
3087         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
3088       }
3089 
3090       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
3091                                   matstruct->alpha_one,
3092                                   matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3093                                   matstruct->cuSpMV[opA].vecXDescr,
3094                                   beta,
3095                                   matstruct->cuSpMV[opA].vecYDescr,
3096                                   cusparse_scalartype,
3097                                   cusparsestruct->spmvAlg,
3098                                   matstruct->cuSpMV[opA].spmvBuffer));
3099      #else
3100       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3101       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
3102                                        mat->num_rows, mat->num_cols,
3103                                        mat->num_entries, matstruct->alpha_one, matstruct->descr,
3104                                        mat->values->data().get(), mat->row_offsets->data().get(),
3105                                        mat->column_indices->data().get(), xptr, beta,
3106                                        dptr));
3107      #endif
3108     } else {
3109       if (cusparsestruct->nrows) {
3110        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3111         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3112        #else
3113         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3114         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
3115                                          matstruct->alpha_one, matstruct->descr, hybMat,
3116                                          xptr, beta,
3117                                          dptr));
3118        #endif
3119       }
3120     }
3121     PetscCall(PetscLogGpuTimeEnd());
3122 
3123     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3124       if (yy) { /* MatMultAdd: zz = A*xx + yy */
3125         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3126           PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
3127         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3128           PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
3129         }
3130       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3131         PetscCall(VecSet_SeqCUDA(zz,0));
3132       }
3133 
3134       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3135       if (compressed) {
3136         PetscCall(PetscLogGpuTimeBegin());
3137         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered),
3138            and in the destructor of the scope it will call cudaStreamSynchronize() on this stream. One would have to store all events
3139            to prevent that, so I just added a ScatterAdd kernel instead.
3140          */
3141        #if 0
3142         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3143         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3144                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3145                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3146                          VecCUDAPlusEquals());
3147        #else
3148         PetscInt n = matstruct->cprowIndices->size();
3149         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
3150        #endif
3151         PetscCall(PetscLogGpuTimeEnd());
3152       }
3153     } else {
3154       if (yy && yy != zz) {
3155         PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
3156       }
3157     }
3158     PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
3159     if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
3160     else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
3161   } catch(char *ex) {
3162     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
3163   }
3164   if (yy) {
3165     PetscCall(PetscLogGpuFlops(2.0*a->nz));
3166   } else {
3167     PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
3168   }
3169   PetscFunctionReturn(0);
3170 }
3171 
3172 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
3173 {
3174   PetscFunctionBegin;
3175   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
3176   PetscFunctionReturn(0);
3177 }
3178 
3179 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
3180 {
3181   PetscObjectState   onnz = A->nonzerostate;
3182   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3183 
3184   PetscFunctionBegin;
3185   PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
3186   if (onnz != A->nonzerostate && cusp->deviceMat) {
3187 
3188     PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
3189     PetscCallCUDA(cudaFree(cusp->deviceMat));
3190     cusp->deviceMat = NULL;
3191   }
3192   PetscFunctionReturn(0);
3193 }
3194 
3195 /* --------------------------------------------------------------------------------*/
3196 /*@
3197    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3198    (the default parallel PETSc format). This matrix will ultimately be pushed down
3199    to NVidia GPUs and will use the CUSPARSE library for calculations. For good matrix
3200    assembly performance the user should preallocate the matrix storage by setting
3201    the parameter nz (or the array nnz).  By setting these parameters accurately,
3202    performance during matrix assembly can be increased by more than a factor of 50.
3203 
3204    Collective
3205 
3206    Input Parameters:
3207 +  comm - MPI communicator, set to PETSC_COMM_SELF
3208 .  m - number of rows
3209 .  n - number of columns
3210 .  nz - number of nonzeros per row (same for all rows)
3211 -  nnz - array containing the number of nonzeros in the various rows
3212          (possibly different for each row) or NULL
3213 
3214    Output Parameter:
3215 .  A - the matrix
3216 
3217    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3218    MatXXXXSetPreallocation() paradigm instead of this routine directly.
3219    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3220 
3221    Notes:
3222    If nnz is given then nz is ignored
3223 
3224    The AIJ format (also called the Yale sparse matrix format or
3225    compressed row storage), is fully compatible with standard Fortran 77
3226    storage.  That is, the stored row and column indices can begin at
3227    either one (as in Fortran) or zero.  See the users' manual for details.
3228 
3229    Specify the preallocated storage with either nz or nnz (not both).
3230    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3231    allocation.  For large problems you MUST preallocate memory or you
3232    will get TERRIBLE performance, see the users' manual chapter on matrices.
3233 
3234    By default, this format uses inodes (identical nodes) when possible, to
3235    improve numerical efficiency of matrix-vector products and solves. We
3236    search for consecutive rows with the same nonzero structure, thereby
3237    reusing matrix information to achieve increased efficiency.
3238 
3239    Level: intermediate
3240 
3241 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
3242 @*/
3243 PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
3244 {
3245   PetscFunctionBegin;
3246   PetscCall(MatCreate(comm,A));
3247   PetscCall(MatSetSizes(*A,m,n,m,n));
3248   PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
3249   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
3250   PetscFunctionReturn(0);
3251 }
3252 
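/* A minimal usage sketch (not part of the PETSc API itself): it assembles a small tridiagonal
   matrix on a MATSEQAIJCUSPARSE object following the MatCreate()/MatSetType()/preallocation
   paradigm recommended above; the loop bounds and values are illustrative only.

     Mat      A;
     PetscInt i, n = 10;

     PetscCall(MatCreate(PETSC_COMM_SELF,&A));
     PetscCall(MatSetSizes(A,n,n,n,n));
     PetscCall(MatSetType(A,MATSEQAIJCUSPARSE));
     PetscCall(MatSeqAIJSetPreallocation(A,3,NULL));            // at most 3 nonzeros per row
     for (i = 0; i < n; i++) {
       PetscScalar v[3]    = {-1.0, 2.0, -1.0};
       PetscInt    cols[3] = {i-1, i, i+1};
       PetscInt    ncols   = 3, *c = cols;
       PetscScalar *vv     = v;
       if (i == 0)   { ncols = 2; c = cols+1; vv = v+1; }        // drop the out-of-range column -1
       if (i == n-1) { ncols = 2; }                              // drop the out-of-range column n
       PetscCall(MatSetValues(A,1,&i,ncols,c,vv,INSERT_VALUES));
     }
     PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));            // data migrates to the GPU on demand, at first use
     PetscCall(MatDestroy(&A));
*/
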
3253 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3254 {
3255   PetscFunctionBegin;
3256   if (A->factortype == MAT_FACTOR_NONE) {
3257     PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
3258   } else {
3259     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
3260   }
3261   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
3262   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
3263   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL));
3264   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
3265   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
3266   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
3267   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
3268   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
3269   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
3270   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL));
3271   PetscCall(MatDestroy_SeqAIJ(A));
3272   PetscFunctionReturn(0);
3273 }
3274 
3275 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3276 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
3277 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
3278 {
3279   PetscFunctionBegin;
3280   PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
3281   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
3282   PetscFunctionReturn(0);
3283 }
3284 
3285 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
3286 {
3287   Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3288   Mat_SeqAIJCUSPARSE *cy;
3289   Mat_SeqAIJCUSPARSE *cx;
3290   PetscScalar        *ay;
3291   const PetscScalar  *ax;
3292   CsrMatrix          *csry,*csrx;
3293 
3294   PetscFunctionBegin;
3295   cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3296   cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
3297   if (X->ops->axpy != Y->ops->axpy) {
3298     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
3299     PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
3300     PetscFunctionReturn(0);
3301   }
3302   /* if we are here, it means both matrices are bound to GPU */
3303   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
3304   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
3305   PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
3306   PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
3307   csry = (CsrMatrix*)cy->mat->mat;
3308   csrx = (CsrMatrix*)cx->mat->mat;
3309   /* see if we can turn this into a cublas axpy */
3310   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3311     bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3312     if (eq) {
3313       eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3314     }
3315     if (eq) str = SAME_NONZERO_PATTERN;
3316   }
3317   /* spgeam is buggy with one column */
3318   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3319 
3320   if (str == SUBSET_NONZERO_PATTERN) {
3321     PetscScalar b = 1.0;
3322 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3323     size_t      bufferSize;
3324     void        *buffer;
3325 #endif
3326 
3327     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
3328     PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
3329     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3330 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3331     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3332                                                   &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3333                                                   &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3334                                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
3335     PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
3336     PetscCall(PetscLogGpuTimeBegin());
3337     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3338                                        &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3339                                        &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3340                                        cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
3341     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3342     PetscCall(PetscLogGpuTimeEnd());
3343     PetscCallCUDA(cudaFree(buffer));
3344 #else
3345     PetscCall(PetscLogGpuTimeBegin());
3346     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3347                                        &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3348                                        &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3349                                        cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
3350     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3351     PetscCall(PetscLogGpuTimeEnd());
3352 #endif
3353     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
3354     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
3355     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
3356     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3357   } else if (str == SAME_NONZERO_PATTERN) {
3358     cublasHandle_t cublasv2handle;
3359     PetscBLASInt   one = 1, bnz = 1;
3360 
3361     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
3362     PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
3363     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3364     PetscCall(PetscBLASIntCast(x->nz,&bnz));
3365     PetscCall(PetscLogGpuTimeBegin());
3366     PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
3367     PetscCall(PetscLogGpuFlops(2.0*bnz));
3368     PetscCall(PetscLogGpuTimeEnd());
3369     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
3370     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
3371     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3372   } else {
3373     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
3374     PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
3375   }
3376   PetscFunctionReturn(0);
3377 }
3378 
3379 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
3380 {
3381   Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
3382   PetscScalar    *ay;
3383   cublasHandle_t cublasv2handle;
3384   PetscBLASInt   one = 1, bnz = 1;
3385 
3386   PetscFunctionBegin;
3387   PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
3388   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3389   PetscCall(PetscBLASIntCast(y->nz,&bnz));
3390   PetscCall(PetscLogGpuTimeBegin());
3391   PetscCallCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one));
3392   PetscCall(PetscLogGpuFlops(bnz));
3393   PetscCall(PetscLogGpuTimeEnd());
3394   PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
3395   PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3396   PetscFunctionReturn(0);
3397 }
3398 
3399 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3400 {
3401   PetscBool      both = PETSC_FALSE;
3402   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3403 
3404   PetscFunctionBegin;
3405   if (A->factortype == MAT_FACTOR_NONE) {
3406     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
3407     if (spptr->mat) {
3408       CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
3409       if (matrix->values) {
3410         both = PETSC_TRUE;
3411         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3412       }
3413     }
3414     if (spptr->matTranspose) {
3415       CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
3416       if (matrix->values) {
3417         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3418       }
3419     }
3420   }
3421   PetscCall(PetscArrayzero(a->a,a->i[A->rmap->n]));
3422   PetscCall(MatSeqAIJInvalidateDiagonal(A));
3423   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3424   else A->offloadmask = PETSC_OFFLOAD_CPU;
3425   PetscFunctionReturn(0);
3426 }
3427 
3428 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3429 {
3430   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3431 
3432   PetscFunctionBegin;
3433   if (A->factortype != MAT_FACTOR_NONE) {
3434     A->boundtocpu = flg;
3435     PetscFunctionReturn(0);
3436   }
3437   if (flg) {
3438     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
3439 
3440     A->ops->scale                     = MatScale_SeqAIJ;
3441     A->ops->axpy                      = MatAXPY_SeqAIJ;
3442     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3443     A->ops->mult                      = MatMult_SeqAIJ;
3444     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3445     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3446     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3447     A->ops->multhermitiantranspose    = NULL;
3448     A->ops->multhermitiantransposeadd = NULL;
3449     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3450     PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
3451     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
3452     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
3453     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
3454     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
3455     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
3456     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
3457   } else {
3458     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3459     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3460     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3461     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3462     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3463     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3464     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3465     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3466     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3467     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
3468     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
3469     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
3470     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
3471     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
3472     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
3473     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
3474     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
3475 
3476     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
3477     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
3478     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
3479     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
3480     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
3481     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
3482    }
3483   A->boundtocpu = flg;
3484   if (flg && a->inode.size) {
3485     a->inode.use = PETSC_TRUE;
3486   } else {
3487     a->inode.use = PETSC_FALSE;
3488   }
3489   PetscFunctionReturn(0);
3490 }
3491 
3492 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
3493 {
3494   Mat              B;
3495 
3496   PetscFunctionBegin;
3497   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
3498   if (reuse == MAT_INITIAL_MATRIX) {
3499     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
3500   } else if (reuse == MAT_REUSE_MATRIX) {
3501     PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
3502   }
3503   B = *newmat;
3504 
3505   PetscCall(PetscFree(B->defaultvectype));
3506   PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));
3507 
3508   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3509     if (B->factortype == MAT_FACTOR_NONE) {
3510       Mat_SeqAIJCUSPARSE *spptr;
3511       PetscCall(PetscNew(&spptr));
3512       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
3513       PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
3514       spptr->format     = MAT_CUSPARSE_CSR;
3515      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3516      #if CUSPARSE_VERSION > 11301
3517       spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
3518      #else
3519       spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
3520      #endif
3521       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3522       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3523      #endif
3524       B->spptr = spptr;
3525     } else {
3526       Mat_SeqAIJCUSPARSETriFactors *spptr;
3527 
3528       PetscCall(PetscNew(&spptr));
3529       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
3530       PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
3531       B->spptr = spptr;
3532     }
3533     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3534   }
3535   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
3536   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
3537   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
3538   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3539   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3540   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
3541 
3542   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE));
3543   PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
3544   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
3545 #if defined(PETSC_HAVE_HYPRE)
3546   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
3547 #endif
3548   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
3549   PetscFunctionReturn(0);
3550 }
3551 
3552 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3553 {
3554   PetscFunctionBegin;
3555   PetscCall(MatCreate_SeqAIJ(B));
3556   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
3557   PetscFunctionReturn(0);
3558 }
3559 
3560 /*MC
3561    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3562 
3563    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3564    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later and are not supported with CUDA 11.0 and later.
3565    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3566 
3567    Options Database Keys:
3568 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3569 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3570 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3571 -  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3572 
3573   Level: beginner
3574 
3575 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3576 M*/
3577 
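/* A hedged usage note (not generated documentation): an application that creates its matrices
   with MatCreate()/MatSetFromOptions() can typically switch to this type purely from the command
   line, e.g.

     ./app -mat_type aijcusparse -vec_type cuda -mat_cusparse_storage_format csr

   -vec_type cuda is assumed here so that the vectors used in MatMult() also live on the GPU;
   the ell/hyb storage format values are only accepted with CUDA versions older than 11.0. */
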
3578 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3579 
3580 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3581 {
3582   PetscFunctionBegin;
3583   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
3584   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse));
3585   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse));
3586   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse));
3587   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse));
3588 
3589   PetscFunctionReturn(0);
3590 }
3591 
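/* A hedged usage note: with the registrations above, the cuSPARSE-based factorizations can be
   selected at run time, for instance

     ./app -pc_type ilu -pc_factor_mat_solver_type cusparse

   or programmatically via PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE). The MATSOLVERCUSPARSEBAND
   LU variant is registered for MATSEQAIJ matrices. */
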
3592 static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
3593 {
3594   Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;
3595 
3596   PetscFunctionBegin;
3597   if (!cusp) PetscFunctionReturn(0);
3598   delete cusp->cooPerm;
3599   delete cusp->cooPerm_a;
3600   cusp->cooPerm = NULL;
3601   cusp->cooPerm_a = NULL;
3602   if (cusp->use_extended_coo) {
3603     PetscCallCUDA(cudaFree(cusp->jmap_d));
3604     PetscCallCUDA(cudaFree(cusp->perm_d));
3605   }
3606   cusp->use_extended_coo = PETSC_FALSE;
3607   PetscFunctionReturn(0);
3608 }
3609 
3610 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3611 {
3612   PetscFunctionBegin;
3613   if (*cusparsestruct) {
3614     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format));
3615     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format));
3616     delete (*cusparsestruct)->workVector;
3617     delete (*cusparsestruct)->rowoffsets_gpu;
3618     delete (*cusparsestruct)->cooPerm;
3619     delete (*cusparsestruct)->cooPerm_a;
3620     delete (*cusparsestruct)->csr2csc_i;
3621     if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
3622     if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
3623     if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
3624     PetscCall(PetscFree(*cusparsestruct));
3625   }
3626   PetscFunctionReturn(0);
3627 }
3628 
3629 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3630 {
3631   PetscFunctionBegin;
3632   if (*mat) {
3633     delete (*mat)->values;
3634     delete (*mat)->column_indices;
3635     delete (*mat)->row_offsets;
3636     delete *mat;
3637     *mat = 0;
3638   }
3639   PetscFunctionReturn(0);
3640 }
3641 
3642 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3643 {
3644   PetscFunctionBegin;
3645   if (*trifactor) {
3646     if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
3647     if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparse_destroy_analysis_info((*trifactor)->solveInfo));
3648     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
3649     if ((*trifactor)->solveBuffer)   PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
3650     if ((*trifactor)->AA_h)   PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
3651    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3652     if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
3653    #endif
3654     PetscCall(PetscFree(*trifactor));
3655   }
3656   PetscFunctionReturn(0);
3657 }
3658 
3659 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
3660 {
3661   CsrMatrix        *mat;
3662 
3663   PetscFunctionBegin;
3664   if (*matstruct) {
3665     if ((*matstruct)->mat) {
3666       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
3667        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3668         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3669        #else
3670         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
3671         PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
3672        #endif
3673       } else {
3674         mat = (CsrMatrix*)(*matstruct)->mat;
3675         CsrMatrix_Destroy(&mat);
3676       }
3677     }
3678     if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
3679     delete (*matstruct)->cprowIndices;
3680     if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
3681     if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
3682     if ((*matstruct)->beta_one)  PetscCallCUDA(cudaFree((*matstruct)->beta_one));
3683 
3684    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3685     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3686     if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
3687     for (int i=0; i<3; i++) {
3688       if (mdata->cuSpMV[i].initialized) {
3689         PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
3690         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
3691         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
3692       }
3693     }
3694    #endif
3695     delete *matstruct;
3696     *matstruct = NULL;
3697   }
3698   PetscFunctionReturn(0);
3699 }
3700 
3701 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
3702 {
3703   PetscFunctionBegin;
3704   if (*trifactors) {
3705     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr));
3706     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr));
3707     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose));
3708     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose));
3709     delete (*trifactors)->rpermIndices;
3710     delete (*trifactors)->cpermIndices;
3711     delete (*trifactors)->workVector;
3712     (*trifactors)->rpermIndices = NULL;
3713     (*trifactors)->cpermIndices = NULL;
3714     (*trifactors)->workVector = NULL;
3715     if ((*trifactors)->a_band_d)   PetscCallCUDA(cudaFree((*trifactors)->a_band_d));
3716     if ((*trifactors)->i_band_d)   PetscCallCUDA(cudaFree((*trifactors)->i_band_d));
3717     (*trifactors)->init_dev_prop = PETSC_FALSE;
3718   }
3719   PetscFunctionReturn(0);
3720 }
3721 
3722 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3723 {
3724   cusparseHandle_t handle;
3725 
3726   PetscFunctionBegin;
3727   if (*trifactors) {
3728     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
3729     if ((handle = (*trifactors)->handle)) {
3730       PetscCallCUSPARSE(cusparseDestroy(handle));
3731     }
3732     PetscCall(PetscFree(*trifactors));
3733   }
3734   PetscFunctionReturn(0);
3735 }
3736 
3737 struct IJCompare
3738 {
3739   __host__ __device__
3740   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3741   {
3742     if (t1.get<0>() < t2.get<0>()) return true;
3743     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3744     return false;
3745   }
3746 };
3747 
3748 struct IJEqual
3749 {
3750   __host__ __device__
3751   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3752   {
3753     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3754     return true;
3755   }
3756 };
3757 
3758 struct IJDiff
3759 {
3760   __host__ __device__
3761   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3762   {
3763     return t1 == t2 ? 0 : 1;
3764   }
3765 };
3766 
3767 struct IJSum
3768 {
3769   __host__ __device__
3770   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3771   {
3772     return t1||t2;
3773   }
3774 };
3775 
3776 #include <thrust/iterator/discard_iterator.h>
3777 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
3778 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
3779 {
3780   Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3781   Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
3782   THRUSTARRAY                           *cooPerm_v = NULL;
3783   thrust::device_ptr<const PetscScalar> d_v;
3784   CsrMatrix                             *matrix;
3785   PetscInt                              n;
3786 
3787   PetscFunctionBegin;
3788   PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
3789   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
3790   if (!cusp->cooPerm) {
3791     PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
3792     PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
3793     PetscFunctionReturn(0);
3794   }
3795   matrix = (CsrMatrix*)cusp->mat->mat;
3796   PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3797   if (!v) {
3798     if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3799     goto finalize;
3800   }
3801   n = cusp->cooPerm->size();
3802   if (isCudaMem(v)) {
3803     d_v = thrust::device_pointer_cast(v);
3804   } else {
3805     cooPerm_v = new THRUSTARRAY(n);
3806     cooPerm_v->assign(v,v+n);
3807     d_v = cooPerm_v->data();
3808     PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
3809   }
3810   PetscCall(PetscLogGpuTimeBegin());
3811   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3812     if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them up */
3813       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3814       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3815       /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
3816         cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
3817         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
3818       */
3819       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3820       thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3821       delete cooPerm_w;
3822     } else {
3823       /* all nonzeros in d_v[] are unique entries */
3824       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3825                                                                 matrix->values->begin()));
3826       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3827                                                                 matrix->values->end()));
3828       thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
3829     }
3830   } else {
3831     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3832       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3833       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3834     } else {
3835       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3836                                                                 matrix->values->begin()));
3837       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3838                                                                 matrix->values->end()));
3839       thrust::for_each(zibit,zieit,VecCUDAEquals());
3840     }
3841   }
3842   PetscCall(PetscLogGpuTimeEnd());
3843 finalize:
3844   delete cooPerm_v;
3845   A->offloadmask = PETSC_OFFLOAD_GPU;
3846   PetscCall(PetscObjectStateIncrease((PetscObject)A));
3847   /* shorter version of MatAssemblyEnd_SeqAIJ */
3848   PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
3849   PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
3850   PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
3851   a->reallocs         = 0;
3852   A->info.mallocs    += 0;
3853   A->info.nz_unneeded = 0;
3854   A->assembled = A->was_assembled = PETSC_TRUE;
3855   A->num_ass++;
3856   PetscFunctionReturn(0);
3857 }
3858 
3859 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3860 {
3861   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3862 
3863   PetscFunctionBegin;
3864   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3865   if (!cusp) PetscFunctionReturn(0);
3866   if (destroy) {
3867     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
3868     delete cusp->csr2csc_i;
3869     cusp->csr2csc_i = NULL;
3870   }
3871   A->transupdated = PETSC_FALSE;
3872   PetscFunctionReturn(0);
3873 }
3874 
3875 #include <thrust/binary_search.h>
3876 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
3877 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
3878 {
3879   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3880   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3881   PetscInt           cooPerm_n, nzr = 0;
3882 
3883   PetscFunctionBegin;
3884   PetscCall(PetscLayoutSetUp(A->rmap));
3885   PetscCall(PetscLayoutSetUp(A->cmap));
3886   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
3887   if (n != cooPerm_n) {
3888     delete cusp->cooPerm;
3889     delete cusp->cooPerm_a;
3890     cusp->cooPerm = NULL;
3891     cusp->cooPerm_a = NULL;
3892   }
3893   if (n) {
3894     THRUSTINTARRAY d_i(n);
3895     THRUSTINTARRAY d_j(n);
3896     THRUSTINTARRAY ii(A->rmap->n);
3897 
3898     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
3899     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
3900 
3901     PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
3902     d_i.assign(coo_i,coo_i+n);
3903     d_j.assign(coo_j,coo_j+n);
3904 
3905     /* Ex.
3906       n = 6
3907       coo_i = [3,3,1,4,1,4]
3908       coo_j = [3,2,2,5,2,6]
3909     */
3910     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
3911     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
3912 
3913     PetscCall(PetscLogGpuTimeBegin());
3914     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
3915     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
3916     *cusp->cooPerm_a = d_i; /* copy the sorted array */
3917     THRUSTINTARRAY w = d_j;
3918 
3919     /*
3920       d_i     = [1,1,3,3,4,4]
3921       d_j     = [2,2,2,3,5,6]
3922       cooPerm = [2,4,1,0,3,5]
3923     */
3924     auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */
3925 
3926     /*
3927       d_i     = [1,3,3,4,4,x]
3928                             ^ekey
3929       d_j     = [2,2,3,5,6,x]
3930                            ^nekey
3931     */
3932     if (nekey == ekey) { /* all entries are unique */
3933       delete cusp->cooPerm_a;
3934       cusp->cooPerm_a = NULL;
3935     } else { /* Stefano: I couldn't come up with a more elegant algorithm */
3936       /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
3937       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
3938       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
3939       (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
3940       w[0] = 0;
3941       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
3942       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
3943     }
3944     thrust::counting_iterator<PetscInt> search_begin(0);
3945     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
3946                         search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
3947                         ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
3948     PetscCall(PetscLogGpuTimeEnd());
3949 
3950     PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
3951     a->singlemalloc = PETSC_FALSE;
3952     a->free_a       = PETSC_TRUE;
3953     a->free_ij      = PETSC_TRUE;
3954     PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
3955     a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
3956     PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
3957     a->nz = a->maxnz = a->i[A->rmap->n];
3958     a->rmax = 0;
3959     PetscCall(PetscMalloc1(a->nz,&a->a));
3960     PetscCall(PetscMalloc1(a->nz,&a->j));
3961     PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
3962     if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
3963     if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
3964     for (PetscInt i = 0; i < A->rmap->n; i++) {
3965       const PetscInt nnzr = a->i[i+1] - a->i[i];
3966       nzr += (PetscInt)!!(nnzr);
3967       a->ilen[i] = a->imax[i] = nnzr;
3968       a->rmax = PetscMax(a->rmax,nnzr);
3969     }
3970     a->nonzerorowcnt = nzr;
3971     A->preallocated = PETSC_TRUE;
3972     PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
3973     PetscCall(MatMarkDiagonal_SeqAIJ(A));
3974   } else {
3975     PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
3976   }
3977   PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));
3978 
3979   /* We want to allocate the CUSPARSE struct for matvec now.
3980      The code is so convoluted now that I prefer to copy zeros */
3981   PetscCall(PetscArrayzero(a->a,a->nz));
3982   PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
3983   A->offloadmask = PETSC_OFFLOAD_CPU;
3984   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3985   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
3986   PetscFunctionReturn(0);
3987 }
3988 
3989 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
3990 {
3991   Mat_SeqAIJ         *seq;
3992   Mat_SeqAIJCUSPARSE *dev;
3993   PetscBool          coo_basic = PETSC_TRUE;
3994   PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;
3995 
3996   PetscFunctionBegin;
3997   PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
3998   PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
3999   if (coo_i) {
4000     PetscCall(PetscGetMemType(coo_i,&mtype));
4001     if (PetscMemTypeHost(mtype)) {
4002       for (PetscCount k=0; k<coo_n; k++) {
4003         if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
4004       }
4005     }
4006   }
4007 
4008   if (coo_basic) { /* i,j are on device or do not contain negative indices */
4009     PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
4010   } else {
4011     PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
4012     mat->offloadmask = PETSC_OFFLOAD_CPU;
4013     PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
4014     seq  = static_cast<Mat_SeqAIJ*>(mat->data);
4015     dev  = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
4016     PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
4017     PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
4018     PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
4019     PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
4020     dev->use_extended_coo = PETSC_TRUE;
4021   }
4022   PetscFunctionReturn(0);
4023 }
4024 
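/* Kernel used by the extended COO path below: for each owned nonzero i it accumulates all COO input
   values that map to it, i.e. sum of kv[perm[k]] for k in [jmap[i],jmap[i+1]), and either overwrites
   (INSERT_VALUES) or adds to (ADD_VALUES) the current entry a[i]. A grid-stride loop is used so that
   any launch configuration covers all nnz entries. */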
4025 __global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
4026 {
4027   PetscCount        i = blockIdx.x*blockDim.x + threadIdx.x;
4028   const PetscCount  grid_size = gridDim.x * blockDim.x;
4029   for (; i<nnz; i+= grid_size) {
4030     PetscScalar sum = 0.0;
4031     for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
4032     a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
4033   }
4034 }
4035 
4036 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4037 {
4038   Mat_SeqAIJ          *seq = (Mat_SeqAIJ*)A->data;
4039   Mat_SeqAIJCUSPARSE  *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
4040   PetscCount          Annz = seq->nz;
4041   PetscMemType        memtype;
4042   const PetscScalar   *v1 = v;
4043   PetscScalar         *Aa;
4044 
4045   PetscFunctionBegin;
4046   if (dev->use_extended_coo) {
4047     PetscCall(PetscGetMemType(v,&memtype));
4048     if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, copy it to the device */
4049       PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar)));
4050       PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
4051     }
4052 
4053     if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
4054     else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa));
4055 
4056     if (Annz) {
4057       MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
4058       PetscCallCUDA(cudaPeekAtLastError());
4059     }
4060 
4061     if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
4062     else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));
4063 
4064     if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1));
4065   } else {
4066     PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
4067   }
4068   PetscFunctionReturn(0);
4069 }
4070 
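/* A minimal COO assembly sketch (assumptions: A is already of type MATSEQAIJCUSPARSE and sized;
   the i/j/v arrays below are illustrative host arrays). It exercises the two routines above via
   the generic interface:

     PetscInt    i[] = {0, 0, 1, 2};
     PetscInt    j[] = {0, 1, 1, 2};
     PetscScalar v[] = {1.0, 2.0, 3.0, 4.0};

     PetscCall(MatSetPreallocationCOO(A,4,i,j));      // dispatches to MatSetPreallocationCOO_SeqAIJCUSPARSE()
     PetscCall(MatSetValuesCOO(A,v,INSERT_VALUES));   // dispatches to MatSetValuesCOO_SeqAIJCUSPARSE()

   v[] may also be a device pointer, in which case no host-to-device copy of the values is needed. */
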
4071 /*@C
4072     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4073 
4074    Not collective
4075 
4076     Input Parameters:
4077 +   A - the matrix
4078 -   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form
4079 
4080     Output Parameters:
4081 +   ia - the CSR row pointers
4082 -   ja - the CSR column indices
4083 
4084     Level: developer
4085 
4086     Notes:
4087       When compressed is true, the CSR structure does not contain empty rows
4088 
4089 .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4090 @*/
4091 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
4092 {
4093   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4094   CsrMatrix          *csr;
4095   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
4096 
4097   PetscFunctionBegin;
4098   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4099   if (!i || !j) PetscFunctionReturn(0);
4100   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4101   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4102   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4103   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4104   csr = (CsrMatrix*)cusp->mat->mat;
4105   if (i) {
4106     if (!compressed && a->compressedrow.use) { /* need full row offset */
4107       if (!cusp->rowoffsets_gpu) {
4108         cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4109         cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4110         PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
4111       }
4112       *i = cusp->rowoffsets_gpu->data().get();
4113     } else *i = csr->row_offsets->data().get();
4114   }
4115   if (j) *j = csr->column_indices->data().get();
4116   PetscFunctionReturn(0);
4117 }
4118 
4119 /*@C
4120     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4121 
4122    Not collective
4123 
4124     Input Parameters:
4125 +   A - the matrix
4126 -   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form
4127 
4128     Output Parameters:
4129 +   ia - the CSR row pointers
4130 -   ja - the CSR column indices
4131 
4132     Level: developer
4133 
4134 .seealso: `MatSeqAIJCUSPARSEGetIJ()`
4135 @*/
4136 PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
4137 {
4138   PetscFunctionBegin;
4139   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4140   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4141   if (i) *i = NULL;
4142   if (j) *j = NULL;
4143   PetscFunctionReturn(0);
4144 }
4145 
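/* A hedged usage sketch for the Get/Restore pair above (i and j point to device memory, so they are
   typically passed to a custom CUDA kernel rather than dereferenced on the host):

     const int *ia,*ja;
     PetscBool  compressed = PETSC_FALSE;

     PetscCall(MatSeqAIJCUSPARSEGetIJ(A,compressed,&ia,&ja));
     // ... launch a kernel that reads the CSR row offsets ia[] and column indices ja[] ...
     PetscCall(MatSeqAIJCUSPARSERestoreIJ(A,compressed,&ia,&ja));
*/
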
4146 /*@C
4147    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4148 
4149    Not Collective
4150 
4151    Input Parameter:
4152 .   A - a MATSEQAIJCUSPARSE matrix
4153 
4154    Output Parameter:
4155 .   a - pointer to the device data
4156 
4157    Level: developer
4158 
4159    Notes: may trigger host-device copies if up-to-date matrix data is on host
4160 
4161 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4162 @*/
4163 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
4164 {
4165   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4166   CsrMatrix          *csr;
4167 
4168   PetscFunctionBegin;
4169   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4170   PetscValidPointer(a,2);
4171   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4172   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4173   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4174   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4175   csr = (CsrMatrix*)cusp->mat->mat;
4176   PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4177   *a = csr->values->data().get();
4178   PetscFunctionReturn(0);
4179 }
4180 
4181 /*@C
4182    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4183 
4184    Not Collective
4185 
4186    Input Parameter:
4187 .   A - a MATSEQAIJCUSPARSE matrix
4188 
4189    Output Parameter:
4190 .   a - pointer to the device data
4191 
4192    Level: developer
4193 
4194 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
4195 @*/
4196 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
4197 {
4198   PetscFunctionBegin;
4199   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4200   PetscValidPointer(a,2);
4201   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4202   *a = NULL;
4203   PetscFunctionReturn(0);
4204 }
4205 
4206 /*@C
4207    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4208 
4209    Not Collective
4210 
4211    Input Parameter:
4212 .   A - a MATSEQAIJCUSPARSE matrix
4213 
4214    Output Parameter:
4215 .   a - pointer to the device data
4216 
4217    Level: developer
4218 
4219    Notes: may trigger host-device copies if up-to-date matrix data is on host
4220 
4221 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4222 @*/
4223 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
4224 {
4225   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4226   CsrMatrix          *csr;
4227 
4228   PetscFunctionBegin;
4229   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4230   PetscValidPointer(a,2);
4231   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4232   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4233   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4234   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4235   csr = (CsrMatrix*)cusp->mat->mat;
4236   PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4237   *a = csr->values->data().get();
4238   A->offloadmask = PETSC_OFFLOAD_GPU;
4239   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
4240   PetscFunctionReturn(0);
4241 }
4242 /*@C
4243    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4244 
4245    Not Collective
4246 
4247    Input Parameter:
4248 .   A - a MATSEQAIJCUSPARSE matrix
4249 
4250    Output Parameter:
4251 .   a - pointer to the device data
4252 
4253    Level: developer
4254 
4255 .seealso: `MatSeqAIJCUSPARSEGetArray()`
4256 @*/
4257 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
4258 {
4259   PetscFunctionBegin;
4260   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4261   PetscValidPointer(a,2);
4262   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4263   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4264   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4265   *a = NULL;
4266   PetscFunctionReturn(0);
4267 }
4268 
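/* A hedged usage sketch for MatSeqAIJCUSPARSEGetArray()/MatSeqAIJCUSPARSERestoreArray(): the returned
   pointer addresses the aij values on the device, so it is meant for device-side code such as a thrust
   call or a user kernel; the scaling mentioned below is only illustrative.

     PetscScalar *aa;

     PetscCall(MatSeqAIJCUSPARSEGetArray(A,&aa));
     // e.g. thrust::transform over [aa, aa + nnz) to scale the stored values in place
     PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&aa));

   The restore call increases the object state and invalidates the cached diagonal, as done above. */
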
4269 /*@C
4270    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4271 
4272    Not Collective
4273 
4274    Input Parameter:
4275 .   A - a MATSEQAIJCUSPARSE matrix
4276 
4277    Output Parameter:
4278 .   a - pointer to the device data
4279 
4280    Level: developer
4281 
4282    Notes: does not trigger host-device copies and flags data validity on the GPU
4283 
4284 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4285 @*/
4286 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
4287 {
4288   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4289   CsrMatrix          *csr;
4290 
4291   PetscFunctionBegin;
4292   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4293   PetscValidPointer(a,2);
4294   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4295   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4296   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4297   csr = (CsrMatrix*)cusp->mat->mat;
4298   PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4299   *a = csr->values->data().get();
4300   A->offloadmask = PETSC_OFFLOAD_GPU;
4301   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
4302   PetscFunctionReturn(0);
4303 }
4304 
4305 /*@C
4306    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
4307 
4308    Not Collective
4309 
4310    Input Parameter:
4311 .   A - a MATSEQAIJCUSPARSE matrix
4312 
4313    Output Parameter:
4314 .   a - pointer to the device data
4315 
4316    Level: developer
4317 
4318 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
4319 @*/
4320 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
4321 {
4322   PetscFunctionBegin;
4323   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4324   PetscValidPointer(a,2);
4325   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4326   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4327   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4328   *a = NULL;
4329   PetscFunctionReturn(0);
4330 }
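/* Illustrative usage sketch (not part of the library source): write-only access, e.g. to overwrite
   all stored values directly on the device without first copying them from the host.

     PetscScalar *a;
     PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&a));      // no host-to-device copy is triggered
     // ... fill a[0 .. nnz-1] on the device ...
     PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&a));  // increases the object state, sets a = NULL
*/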
4331 
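/* orders (row,column,value,flag) tuples by row and then by column; used below to merge the COO representations of A and B */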
4332 struct IJCompare4
4333 {
4334   __host__ __device__
4335   inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4336   {
4337     if (t1.get<0>() < t2.get<0>()) return true;
4338     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4339     return false;
4340   }
4341 };
4342 
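/* adds a fixed integer shift to an index; used below (via thrust::transform or transform iterators) to offset B's column indices and transposed row offsets */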
4343 struct Shift
4344 {
4345   int _shift;
4346 
4347   Shift(int shift) : _shift(shift) {}
4348   __host__ __device__
4349   inline int operator() (const int &c)
4350   {
4351     return c + _shift;
4352   }
4353 };
4354 
4355 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating them side by side: row i of C is row i of A followed by row i of B, i.e. C = [A B] (the [A';B']' operation in MATLAB notation) */
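/* For example, with A of size 2x2 and B of size 2x1:
     A = [a00 a01]   B = [b00]   =>   C = [a00 a01 b00]
         [a10 a11]       [b10]            [a10 a11 b10]
   so C has A->rmap->n rows and A->cmap->n + B->cmap->n columns, and B's column indices are shifted by A->cmap->n. */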
4356 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
4357 {
4358   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
4359   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
4360   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4361   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
4362   PetscInt                     Annz,Bnnz;
4363   cusparseStatus_t             stat;
4364   PetscInt                     i,m,n,zero = 0;
4365 
4366   PetscFunctionBegin;
4367   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4368   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
4369   PetscValidPointer(C,4);
4370   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4371   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
4372   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
4373   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
4374   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4375   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4376   if (reuse == MAT_INITIAL_MATRIX) {
4377     m     = A->rmap->n;
4378     n     = A->cmap->n + B->cmap->n;
4379     PetscCall(MatCreate(PETSC_COMM_SELF,C));
4380     PetscCall(MatSetSizes(*C,m,n,m,n));
4381     PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
4382     c     = (Mat_SeqAIJ*)(*C)->data;
4383     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4384     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
4385     Ccsr  = new CsrMatrix;
4386     Cmat->cprowIndices      = NULL;
4387     c->compressedrow.use    = PETSC_FALSE;
4388     c->compressedrow.nrows  = 0;
4389     c->compressedrow.i      = NULL;
4390     c->compressedrow.rindex = NULL;
4391     Ccusp->workVector       = NULL;
4392     Ccusp->nrows    = m;
4393     Ccusp->mat      = Cmat;
4394     Ccusp->mat->mat = Ccsr;
4395     Ccsr->num_rows  = m;
4396     Ccsr->num_cols  = n;
4397     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4398     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4399     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4400     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
4401     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
4402     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
4403     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
4404     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
4405     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
4406     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4407     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4408     PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4409     PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4410 
4411     Acsr = (CsrMatrix*)Acusp->mat->mat;
4412     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4413     Annz = (PetscInt)Acsr->column_indices->size();
4414     Bnnz = (PetscInt)Bcsr->column_indices->size();
4415     c->nz = Annz + Bnnz;
4416     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4417     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4418     Ccsr->values = new THRUSTARRAY(c->nz);
4419     Ccsr->num_entries = c->nz;
4420     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4421     if (c->nz) {
4422       auto Acoo = new THRUSTINTARRAY32(Annz);
4423       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4424       auto Ccoo = new THRUSTINTARRAY32(c->nz);
4425       THRUSTINTARRAY32 *Aroff,*Broff;
4426 
4427       if (a->compressedrow.use) { /* need full row offset */
4428         if (!Acusp->rowoffsets_gpu) {
4429           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4430           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4431           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
4432         }
4433         Aroff = Acusp->rowoffsets_gpu;
4434       } else Aroff = Acsr->row_offsets;
4435       if (b->compressedrow.use) { /* need full row offset */
4436         if (!Bcusp->rowoffsets_gpu) {
4437           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
4438           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4439           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
4440         }
4441         Broff = Bcusp->rowoffsets_gpu;
4442       } else Broff = Bcsr->row_offsets;
4443       PetscCall(PetscLogGpuTimeBegin());
4444       stat = cusparseXcsr2coo(Acusp->handle,
4445                               Aroff->data().get(),
4446                               Annz,
4447                               m,
4448                               Acoo->data().get(),
4449                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
4450       stat = cusparseXcsr2coo(Bcusp->handle,
4451                               Broff->data().get(),
4452                               Bnnz,
4453                               m,
4454                               Bcoo->data().get(),
4455                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
4456       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4457       auto Aperm = thrust::make_constant_iterator(1);
4458       auto Bperm = thrust::make_constant_iterator(0);
4459 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4460       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4461       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
4462 #else
4463       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4464       auto Bcib = Bcsr->column_indices->begin();
4465       auto Bcie = Bcsr->column_indices->end();
4466       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4467 #endif
4468       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4469       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4470       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4471       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4472       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4473       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4474       auto p1 = Ccusp->cooPerm->begin();
4475       auto p2 = Ccusp->cooPerm->begin();
4476       thrust::advance(p2,Annz);
4477       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4478 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4479       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4480 #endif
4481       auto cci = thrust::make_counting_iterator(zero);
4482       auto cce = thrust::make_counting_iterator(c->nz);
4483 #if 0 //Errors on SUMMIT cuda 11.1.0
4484       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4485 #else
4486       auto pred = thrust::identity<int>();
4487       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4488       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4489 #endif
4490       stat = cusparseXcoo2csr(Ccusp->handle,
4491                               Ccoo->data().get(),
4492                               c->nz,
4493                               m,
4494                               Ccsr->row_offsets->data().get(),
4495                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
4496       PetscCall(PetscLogGpuTimeEnd());
4497       delete wPerm;
4498       delete Acoo;
4499       delete Bcoo;
4500       delete Ccoo;
4501 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4502       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4503                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4504                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4505                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
4506 #endif
4507       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4508         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
4509         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4510         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4511         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4512         CsrMatrix *CcsrT = new CsrMatrix;
4513         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4514         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4515 
4516         (*C)->form_explicit_transpose = PETSC_TRUE;
4517         (*C)->transupdated = PETSC_TRUE;
4518         Ccusp->rowoffsets_gpu = NULL;
4519         CmatT->cprowIndices = NULL;
4520         CmatT->mat = CcsrT;
4521         CcsrT->num_rows = n;
4522         CcsrT->num_cols = m;
4523         CcsrT->num_entries = c->nz;
4524 
4525         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4526         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4527         CcsrT->values = new THRUSTARRAY(c->nz);
4528 
4529         PetscCall(PetscLogGpuTimeBegin());
4530         auto rT = CcsrT->row_offsets->begin();
4531         if (AT) {
4532           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4533           thrust::advance(rT,-1);
4534         }
4535         if (BT) {
4536           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4537           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4538           thrust::copy(titb,tite,rT);
4539         }
4540         auto cT = CcsrT->column_indices->begin();
4541         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4542         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4543         auto vT = CcsrT->values->begin();
4544         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4545         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4546         PetscCall(PetscLogGpuTimeEnd());
4547 
4548         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
4549         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
4550         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4551         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
4552         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
4553         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
4554         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
4555         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
4556         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
4557 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4558         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4559                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4560                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4561                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
4562 #endif
4563         Ccusp->matTranspose = CmatT;
4564       }
4565     }
4566 
4567     c->singlemalloc = PETSC_FALSE;
4568     c->free_a       = PETSC_TRUE;
4569     c->free_ij      = PETSC_TRUE;
4570     PetscCall(PetscMalloc1(m+1,&c->i));
4571     PetscCall(PetscMalloc1(c->nz,&c->j));
4572     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4573       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4574       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4575       ii   = *Ccsr->row_offsets;
4576       jj   = *Ccsr->column_indices;
4577       PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
4578       PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
4579     } else {
4580       PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
4581       PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
4582     }
4583     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
4584     PetscCall(PetscMalloc1(m,&c->ilen));
4585     PetscCall(PetscMalloc1(m,&c->imax));
4586     c->maxnz = c->nz;
4587     c->nonzerorowcnt = 0;
4588     c->rmax = 0;
4589     for (i = 0; i < m; i++) {
4590       const PetscInt nn = c->i[i+1] - c->i[i];
4591       c->ilen[i] = c->imax[i] = nn;
4592       c->nonzerorowcnt += (PetscInt)!!nn;
4593       c->rmax = PetscMax(c->rmax,nn);
4594     }
4595     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
4596     PetscCall(PetscMalloc1(c->nz,&c->a));
4597     (*C)->nonzerostate++;
4598     PetscCall(PetscLayoutSetUp((*C)->rmap));
4599     PetscCall(PetscLayoutSetUp((*C)->cmap));
4600     Ccusp->nonzerostate = (*C)->nonzerostate;
4601     (*C)->preallocated  = PETSC_TRUE;
4602   } else {
4603     PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
4604     c = (Mat_SeqAIJ*)(*C)->data;
4605     if (c->nz) {
4606       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4607       PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4608       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4609       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4610       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4611       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4612       PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4613       PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4614       Acsr = (CsrMatrix*)Acusp->mat->mat;
4615       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4616       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4617       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
4618       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4619       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4620       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4621       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4622       auto pmid = Ccusp->cooPerm->begin();
4623       thrust::advance(pmid,Acsr->num_entries);
4624       PetscCall(PetscLogGpuTimeBegin());
4625       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4626                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4627       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4628                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4629       thrust::for_each(zibait,zieait,VecCUDAEquals());
4630       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4631                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4632       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4633                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4634       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4635       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
4636       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4637         PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4638         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4639         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4640         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4641         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4642         auto vT = CcsrT->values->begin();
4643         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4644         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4645         (*C)->transupdated = PETSC_TRUE;
4646       }
4647       PetscCall(PetscLogGpuTimeEnd());
4648     }
4649   }
4650   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4651   (*C)->assembled     = PETSC_TRUE;
4652   (*C)->was_assembled = PETSC_FALSE;
4653   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4654   PetscFunctionReturn(0);
4655 }
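/* Illustrative usage sketch (not part of the library source): build C = [A B] once, then refresh
   only its values when A and B change numerically but keep the same nonzero pattern.

     Mat C;
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A,B,MAT_INITIAL_MATRIX,&C));  // allocates C and its GPU structures
     // ... update the values (not the pattern) of A and B ...
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A,B,MAT_REUSE_MATRIX,&C));    // only scatters the new values into C
*/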
4656 
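/* Gathers the matrix values at positions idx[0..n-1] of the device CSR value array of A into v[];
   v may point to either host or device memory, and when idx is NULL the first n values are copied. */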
4657 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4658 {
4659   bool              dmem;
4660   const PetscScalar *av;
4661 
4662   PetscFunctionBegin;
4663   dmem = isCudaMem(v);
4664   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
4665   if (n && idx) {
4666     THRUSTINTARRAY widx(n);
4667     widx.assign(idx,idx+n);
4668     PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
4669 
4670     THRUSTARRAY *w = NULL;
4671     thrust::device_ptr<PetscScalar> dv;
4672     if (dmem) {
4673       dv = thrust::device_pointer_cast(v);
4674     } else {
4675       w = new THRUSTARRAY(n);
4676       dv = w->data();
4677     }
4678     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4679 
4680     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
4681     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
4682     thrust::for_each(zibit,zieit,VecCUDAEquals());
4683     if (w) {
4684       PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
4685     }
4686     delete w;
4687   } else {
4688     PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
4689   }
4690   if (!dmem) PetscCall(PetscLogGpuToCpu(n*sizeof(PetscScalar))); /* the gathered values were copied from the GPU to host memory v */
4691   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
4692   PetscFunctionReturn(0);
4693 }
4694