xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision f93f85713de9e9013005b55e898d32f7c1354932)
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
79ae82921SPaul Mullowney 
83d13b8fdSMatthew G. Knepley #include <petscconf.h>
93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
139ae82921SPaul Mullowney #undef VecType
143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
17a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
18a2cee5feSJed Brown #include <thrust/remove.h>
19a2cee5feSJed Brown #include <thrust/sort.h>
20a2cee5feSJed Brown #include <thrust/unique.h>
21e8d2b73aSMark Adams 
22e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
23afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
24afb2bd1cSJunchao Zhang   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
25afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
26afb2bd1cSJunchao Zhang 
27afb2bd1cSJunchao Zhang   typedef enum {
28afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
29afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
30afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
31afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
32afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
33afb2bd1cSJunchao Zhang 
34afb2bd1cSJunchao Zhang   typedef enum {
35afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
36afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
37afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
38afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
39afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
40afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
41afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
42afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
43afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
44afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
45afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
46afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
47afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
48afb2bd1cSJunchao Zhang 
49afb2bd1cSJunchao Zhang   typedef enum {
50afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
51afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
52afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
53afb2bd1cSJunchao Zhang   */
54afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
55afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
56afb2bd1cSJunchao Zhang   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
57afb2bd1cSJunchao Zhang #endif
589ae82921SPaul Mullowney 
59087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
60087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
61087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62087f3262SPaul Mullowney 
636fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
646fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
656fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66087f3262SPaul Mullowney 
676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
706fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
714416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
7333c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
746fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
756fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
766fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
776fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
819ae82921SPaul Mullowney 
827f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
877f756511SDominic Meiser 
8857181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
89a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
9057181aedSStefano Zampini 
91c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92219fbbafSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
93219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
94c215019aSStefano Zampini 
95ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
969ae82921SPaul Mullowney {
979ae82921SPaul Mullowney   PetscFunctionBegin;
989ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
999ae82921SPaul Mullowney   PetscFunctionReturn(0);
1009ae82921SPaul Mullowney }
1019ae82921SPaul Mullowney 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/
1149ae82921SPaul Mullowney 
11542c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
1169ae82921SPaul Mullowney {
117bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1189ae82921SPaul Mullowney 
1199ae82921SPaul Mullowney   PetscFunctionBegin;
1209566063dSJacob Faibussowitsch   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
1219566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*B,n,n,n,n));
1222c7c0729SBarry Smith   (*B)->factortype = ftype;
1239566063dSJacob Faibussowitsch   PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE));
1242205254eSKarl Rupp 
1259566063dSJacob Faibussowitsch   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE));
126087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1279566063dSJacob Faibussowitsch     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
1289c1083e7SRichard Tran Mills     if (!A->boundtocpu) {
1299ae82921SPaul Mullowney       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1309ae82921SPaul Mullowney       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
1319c1083e7SRichard Tran Mills     } else {
1329c1083e7SRichard Tran Mills       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1339c1083e7SRichard Tran Mills       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
1349c1083e7SRichard Tran Mills     }
1359566063dSJacob Faibussowitsch     PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1369566063dSJacob Faibussowitsch     PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
1379566063dSJacob Faibussowitsch     PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
138087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1399c1083e7SRichard Tran Mills     if (!A->boundtocpu) {
140087f3262SPaul Mullowney       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
141087f3262SPaul Mullowney       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1429c1083e7SRichard Tran Mills     } else {
1439c1083e7SRichard Tran Mills       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
1449c1083e7SRichard Tran Mills       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1459c1083e7SRichard Tran Mills     }
1469566063dSJacob Faibussowitsch     PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
1479566063dSJacob Faibussowitsch     PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
1489ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
149bc3f50f2SPaul Mullowney 
1509566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1514ac6704cSBarry Smith   (*B)->canuseordering = PETSC_TRUE;
1529566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
1539ae82921SPaul Mullowney   PetscFunctionReturn(0);
1549ae82921SPaul Mullowney }
1559ae82921SPaul Mullowney 
156bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
157ca45077fSPaul Mullowney {
158aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1596e111a19SKarl Rupp 
160ca45077fSPaul Mullowney   PetscFunctionBegin;
161ca45077fSPaul Mullowney   switch (op) {
162e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
163aa372e3fSPaul Mullowney     cusparsestruct->format = format;
164ca45077fSPaul Mullowney     break;
165e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
166aa372e3fSPaul Mullowney     cusparsestruct->format = format;
167ca45077fSPaul Mullowney     break;
168ca45077fSPaul Mullowney   default:
16998921bdaSJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
170ca45077fSPaul Mullowney   }
171ca45077fSPaul Mullowney   PetscFunctionReturn(0);
172ca45077fSPaul Mullowney }
1739ae82921SPaul Mullowney 
174e057df02SPaul Mullowney /*@
175e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
176e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
177aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
178e057df02SPaul Mullowney    Not Collective
179e057df02SPaul Mullowney 
180e057df02SPaul Mullowney    Input Parameters:
1818468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
18236d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
1832692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
184e057df02SPaul Mullowney 
185e057df02SPaul Mullowney    Output Parameter:
186e057df02SPaul Mullowney 
187e057df02SPaul Mullowney    Level: intermediate
188e057df02SPaul Mullowney 
189db781477SPatrick Sanan .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
190e057df02SPaul Mullowney @*/
191e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
192e057df02SPaul Mullowney {
193e057df02SPaul Mullowney   PetscFunctionBegin;
194e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
195cac4c232SBarry Smith   PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
196e057df02SPaul Mullowney   PetscFunctionReturn(0);
197e057df02SPaul Mullowney }
198e057df02SPaul Mullowney 
199365b711fSMark Adams PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
200365b711fSMark Adams {
201365b711fSMark Adams   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
202365b711fSMark Adams 
203365b711fSMark Adams   PetscFunctionBegin;
204365b711fSMark Adams   cusparsestruct->use_cpu_solve = use_cpu;
205365b711fSMark Adams   PetscFunctionReturn(0);
206365b711fSMark Adams }
207365b711fSMark Adams 
208365b711fSMark Adams /*@
209365b711fSMark Adams    MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.
210365b711fSMark Adams 
211365b711fSMark Adams    Input Parameters:
212365b711fSMark Adams +  A - Matrix of type SEQAIJCUSPARSE
213365b711fSMark Adams -  use_cpu - set flag for using the built-in CPU MatSolve
214365b711fSMark Adams 
215365b711fSMark Adams    Output Parameter:
216365b711fSMark Adams 
217365b711fSMark Adams    Notes:
218365b711fSMark Adams    The cuSparse LU solver currently computes the factors with the built-in CPU method
219365b711fSMark Adams    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
220365b711fSMark Adams    This method to specify if the solve is done on the CPU or GPU (GPU is the default).
221365b711fSMark Adams 
222365b711fSMark Adams    Level: intermediate
223365b711fSMark Adams 
224db781477SPatrick Sanan .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
225365b711fSMark Adams @*/
226365b711fSMark Adams PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
227365b711fSMark Adams {
228365b711fSMark Adams   PetscFunctionBegin;
229365b711fSMark Adams   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
230cac4c232SBarry Smith   PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));
231365b711fSMark Adams   PetscFunctionReturn(0);
232365b711fSMark Adams }
233365b711fSMark Adams 
2341a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
235e6e9a74fSStefano Zampini {
236e6e9a74fSStefano Zampini   PetscFunctionBegin;
2371a2c6b5cSJunchao Zhang   switch (op) {
2381a2c6b5cSJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
2391a2c6b5cSJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
2409566063dSJacob Faibussowitsch       if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
2411a2c6b5cSJunchao Zhang       A->form_explicit_transpose = flg;
2421a2c6b5cSJunchao Zhang       break;
2431a2c6b5cSJunchao Zhang     default:
2449566063dSJacob Faibussowitsch       PetscCall(MatSetOption_SeqAIJ(A,op,flg));
2451a2c6b5cSJunchao Zhang       break;
246e6e9a74fSStefano Zampini   }
247e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
248e6e9a74fSStefano Zampini }
249e6e9a74fSStefano Zampini 
250bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
251bddcd29dSMark Adams 
252bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
253bddcd29dSMark Adams {
254bddcd29dSMark Adams   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
255bddcd29dSMark Adams   IS             isrow = b->row,iscol = b->col;
256bddcd29dSMark Adams   PetscBool      row_identity,col_identity;
257365b711fSMark Adams   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;
258bddcd29dSMark Adams 
259bddcd29dSMark Adams   PetscFunctionBegin;
2609566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2619566063dSJacob Faibussowitsch   PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
262bddcd29dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
263bddcd29dSMark Adams   /* determine which version of MatSolve needs to be used. */
2649566063dSJacob Faibussowitsch   PetscCall(ISIdentity(isrow,&row_identity));
2659566063dSJacob Faibussowitsch   PetscCall(ISIdentity(iscol,&col_identity));
266*f93f8571SJunchao Zhang 
267365b711fSMark Adams   if (!cusparsestruct->use_cpu_solve) {
268*f93f8571SJunchao Zhang     if (row_identity && col_identity) {
269bddcd29dSMark Adams       B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
270bddcd29dSMark Adams       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
271bddcd29dSMark Adams     } else {
272bddcd29dSMark Adams       B->ops->solve = MatSolve_SeqAIJCUSPARSE;
273bddcd29dSMark Adams       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
274365b711fSMark Adams     }
275*f93f8571SJunchao Zhang   }
276bddcd29dSMark Adams   B->ops->matsolve = NULL;
277bddcd29dSMark Adams   B->ops->matsolvetranspose = NULL;
278bddcd29dSMark Adams 
279bddcd29dSMark Adams   /* get the triangular factors */
280365b711fSMark Adams   if (!cusparsestruct->use_cpu_solve) {
2819566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
282365b711fSMark Adams   }
283bddcd29dSMark Adams   PetscFunctionReturn(0);
284bddcd29dSMark Adams }
285bddcd29dSMark Adams 
2864416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
2879ae82921SPaul Mullowney {
288e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
2899ae82921SPaul Mullowney   PetscBool                flg;
290a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2916e111a19SKarl Rupp 
2929ae82921SPaul Mullowney   PetscFunctionBegin;
293d0609cedSBarry Smith   PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options");
2949ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
295d0609cedSBarry Smith     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
296d0609cedSBarry Smith                                "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
2979566063dSJacob Faibussowitsch     if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));
298afb2bd1cSJunchao Zhang 
299d0609cedSBarry Smith     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
300d0609cedSBarry Smith                                "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
3019566063dSJacob Faibussowitsch     if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
3029566063dSJacob Faibussowitsch     PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
3039566063dSJacob Faibussowitsch     if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
304afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
305d0609cedSBarry Smith     PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
306d0609cedSBarry Smith                                "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg));
307afb2bd1cSJunchao Zhang     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
308ba986b86SSatish Balay #if CUSPARSE_VERSION > 11301
309aed4548fSBarry Smith     PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
310a435da06SStefano Zampini #else
311aed4548fSBarry Smith     PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
312a435da06SStefano Zampini #endif
313d0609cedSBarry Smith     PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
314d0609cedSBarry Smith                                "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg));
315aed4548fSBarry Smith     PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
316afb2bd1cSJunchao Zhang 
317d0609cedSBarry Smith     PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
318d0609cedSBarry Smith                                "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg));
319aed4548fSBarry Smith     PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
320afb2bd1cSJunchao Zhang    #endif
3214c87dfd4SPaul Mullowney   }
322d0609cedSBarry Smith   PetscOptionsHeadEnd();
3239ae82921SPaul Mullowney   PetscFunctionReturn(0);
3249ae82921SPaul Mullowney }
3259ae82921SPaul Mullowney 
3266fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3279ae82921SPaul Mullowney {
328da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3299ae82921SPaul Mullowney 
3309ae82921SPaul Mullowney   PetscFunctionBegin;
3319566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
3329566063dSJacob Faibussowitsch   PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
3339ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3349ae82921SPaul Mullowney   PetscFunctionReturn(0);
3359ae82921SPaul Mullowney }
3369ae82921SPaul Mullowney 
3376fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3389ae82921SPaul Mullowney {
339da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3409ae82921SPaul Mullowney 
3419ae82921SPaul Mullowney   PetscFunctionBegin;
3429566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
3439566063dSJacob Faibussowitsch   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
3449ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3459ae82921SPaul Mullowney   PetscFunctionReturn(0);
3469ae82921SPaul Mullowney }
3479ae82921SPaul Mullowney 
348087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
349087f3262SPaul Mullowney {
350da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
351087f3262SPaul Mullowney 
352087f3262SPaul Mullowney   PetscFunctionBegin;
3539566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
3549566063dSJacob Faibussowitsch   PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
355087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
356087f3262SPaul Mullowney   PetscFunctionReturn(0);
357087f3262SPaul Mullowney }
358087f3262SPaul Mullowney 
359087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
360087f3262SPaul Mullowney {
361da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
362087f3262SPaul Mullowney 
363087f3262SPaul Mullowney   PetscFunctionBegin;
3649566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
3659566063dSJacob Faibussowitsch   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
366087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
367087f3262SPaul Mullowney   PetscFunctionReturn(0);
368087f3262SPaul Mullowney }
369087f3262SPaul Mullowney 
370087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3719ae82921SPaul Mullowney {
3729ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3739ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3749ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
375aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3769ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3779ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3789ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3799ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
3809ae82921SPaul Mullowney 
3819ae82921SPaul Mullowney   PetscFunctionBegin;
382cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
383c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3849ae82921SPaul Mullowney     try {
3859ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3869ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
387da79fbbcSStefano Zampini       if (!loTriFactor) {
3882cbc15d9SMark         PetscScalar                       *AALo;
3892cbc15d9SMark 
3909566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));
3919ae82921SPaul Mullowney 
3929ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
3939566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
3949566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));
3959ae82921SPaul Mullowney 
3969ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3979ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
3989ae82921SPaul Mullowney         AiLo[n]  = nzLower;
3999ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
4009ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
4019ae82921SPaul Mullowney         v        = aa;
4029ae82921SPaul Mullowney         vi       = aj;
4039ae82921SPaul Mullowney         offset   = 1;
4049ae82921SPaul Mullowney         rowOffset= 1;
4059ae82921SPaul Mullowney         for (i=1; i<n; i++) {
4069ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
407e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
4089ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
4099ae82921SPaul Mullowney           rowOffset += nz+1;
4109ae82921SPaul Mullowney 
4119566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
4129566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));
4139ae82921SPaul Mullowney 
4149ae82921SPaul Mullowney           offset      += nz;
4159ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
4169ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
4179ae82921SPaul Mullowney           offset      += 1;
4189ae82921SPaul Mullowney 
4199ae82921SPaul Mullowney           v  += nz;
4209ae82921SPaul Mullowney           vi += nz;
4219ae82921SPaul Mullowney         }
4222205254eSKarl Rupp 
423aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
4249566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
425da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
426aa372e3fSPaul Mullowney         /* Create the matrix description */
4279566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
4289566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
4291b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
4309566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
431afb2bd1cSJunchao Zhang        #else
4329566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
433afb2bd1cSJunchao Zhang        #endif
4349566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
4359566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
436aa372e3fSPaul Mullowney 
437aa372e3fSPaul Mullowney         /* set the operation */
438aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
439aa372e3fSPaul Mullowney 
440aa372e3fSPaul Mullowney         /* set the matrix */
441aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
442aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
443aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
444aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
445aa372e3fSPaul Mullowney 
446aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
447aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
448aa372e3fSPaul Mullowney 
449aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
450aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
451aa372e3fSPaul Mullowney 
452aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
453aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
454aa372e3fSPaul Mullowney 
455afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
4569566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
4579566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
4581b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
4599566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
460afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
461afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
462afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
4635f80ce2aSJacob Faibussowitsch                                                &loTriFactor->solveBufferSize));
4649566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
465afb2bd1cSJunchao Zhang       #endif
466afb2bd1cSJunchao Zhang 
467aa372e3fSPaul Mullowney         /* perform the solve analysis */
4689566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
469aa372e3fSPaul Mullowney                                          loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
470aa372e3fSPaul Mullowney                                          loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
471d49cd2b7SBarry Smith                                          loTriFactor->csrMat->column_indices->data().get(),
4721b0a6780SStefano Zampini                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
473d49cd2b7SBarry Smith                                          loTriFactor->solveInfo,
4745f80ce2aSJacob Faibussowitsch                                          loTriFactor->solvePolicy, loTriFactor->solveBuffer));
475d49cd2b7SBarry Smith                                          #else
4765f80ce2aSJacob Faibussowitsch                                          loTriFactor->solveInfo));
477afb2bd1cSJunchao Zhang                                          #endif
4789566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
4799566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
480aa372e3fSPaul Mullowney 
481da79fbbcSStefano Zampini         /* assign the pointer */
482aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
4832cbc15d9SMark         loTriFactor->AA_h = AALo;
4849566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiLo));
4859566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjLo));
4869566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
487da79fbbcSStefano Zampini       } else { /* update values only */
4882cbc15d9SMark         if (!loTriFactor->AA_h) {
4899566063dSJacob Faibussowitsch           PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
4902cbc15d9SMark         }
491da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4922cbc15d9SMark         loTriFactor->AA_h[0]  = 1.0;
493da79fbbcSStefano Zampini         v        = aa;
494da79fbbcSStefano Zampini         vi       = aj;
495da79fbbcSStefano Zampini         offset   = 1;
496da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
497da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
4989566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
499da79fbbcSStefano Zampini           offset      += nz;
5002cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
501da79fbbcSStefano Zampini           offset      += 1;
502da79fbbcSStefano Zampini           v  += nz;
503da79fbbcSStefano Zampini         }
5042cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
5059566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
506da79fbbcSStefano Zampini       }
5079ae82921SPaul Mullowney     } catch(char *ex) {
50898921bdaSJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
5099ae82921SPaul Mullowney     }
5109ae82921SPaul Mullowney   }
5119ae82921SPaul Mullowney   PetscFunctionReturn(0);
5129ae82921SPaul Mullowney }
5139ae82921SPaul Mullowney 
514087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
5159ae82921SPaul Mullowney {
5169ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
5179ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
5189ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
519aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
5209ae82921SPaul Mullowney   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
5219ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
5229ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
5239ae82921SPaul Mullowney   PetscInt                          i,nz, nzUpper, offset;
5249ae82921SPaul Mullowney 
5259ae82921SPaul Mullowney   PetscFunctionBegin;
526cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
527c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
5289ae82921SPaul Mullowney     try {
5299ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
5309ae82921SPaul Mullowney       nzUpper = adiag[0]-adiag[n];
531da79fbbcSStefano Zampini       if (!upTriFactor) {
5322cbc15d9SMark         PetscScalar *AAUp;
5332cbc15d9SMark 
5349566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
5352cbc15d9SMark 
5369ae82921SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
5379566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
5389566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));
5399ae82921SPaul Mullowney 
5409ae82921SPaul Mullowney         /* Fill the upper triangular matrix */
5419ae82921SPaul Mullowney         AiUp[0]=(PetscInt) 0;
5429ae82921SPaul Mullowney         AiUp[n]=nzUpper;
5439ae82921SPaul Mullowney         offset = nzUpper;
5449ae82921SPaul Mullowney         for (i=n-1; i>=0; i--) {
5459ae82921SPaul Mullowney           v  = aa + adiag[i+1] + 1;
5469ae82921SPaul Mullowney           vi = aj + adiag[i+1] + 1;
5479ae82921SPaul Mullowney 
548e057df02SPaul Mullowney           /* number of elements NOT on the diagonal */
5499ae82921SPaul Mullowney           nz = adiag[i] - adiag[i+1]-1;
5509ae82921SPaul Mullowney 
551e057df02SPaul Mullowney           /* decrement the offset */
5529ae82921SPaul Mullowney           offset -= (nz+1);
5539ae82921SPaul Mullowney 
554e057df02SPaul Mullowney           /* first, set the diagonal elements */
5559ae82921SPaul Mullowney           AjUp[offset] = (PetscInt) i;
55609f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1./v[nz];
5579ae82921SPaul Mullowney           AiUp[i]      = AiUp[i+1] - (nz+1);
5589ae82921SPaul Mullowney 
5599566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
5609566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
5619ae82921SPaul Mullowney         }
5622205254eSKarl Rupp 
563aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
5649566063dSJacob Faibussowitsch         PetscCall(PetscNew(&upTriFactor));
565da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
5662205254eSKarl Rupp 
567aa372e3fSPaul Mullowney         /* Create the matrix description */
5689566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
5699566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
5701b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
5719566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
572afb2bd1cSJunchao Zhang        #else
5739566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
574afb2bd1cSJunchao Zhang        #endif
5759566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
5769566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
577aa372e3fSPaul Mullowney 
578aa372e3fSPaul Mullowney         /* set the operation */
579aa372e3fSPaul Mullowney         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
580aa372e3fSPaul Mullowney 
581aa372e3fSPaul Mullowney         /* set the matrix */
582aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
583aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = n;
584aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = n;
585aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = nzUpper;
586aa372e3fSPaul Mullowney 
587aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
588aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
589aa372e3fSPaul Mullowney 
590aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
591aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
592aa372e3fSPaul Mullowney 
593aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
594aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
595aa372e3fSPaul Mullowney 
596afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
5979566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
5989566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
5991b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
6009566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
601afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
602afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
603afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
6045f80ce2aSJacob Faibussowitsch                                                &upTriFactor->solveBufferSize));
6059566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
606afb2bd1cSJunchao Zhang       #endif
607afb2bd1cSJunchao Zhang 
608aa372e3fSPaul Mullowney         /* perform the solve analysis */
6099566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
610aa372e3fSPaul Mullowney                                          upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
611aa372e3fSPaul Mullowney                                          upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
612d49cd2b7SBarry Smith                                          upTriFactor->csrMat->column_indices->data().get(),
6131b0a6780SStefano Zampini                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
614d49cd2b7SBarry Smith                                          upTriFactor->solveInfo,
6155f80ce2aSJacob Faibussowitsch                                          upTriFactor->solvePolicy, upTriFactor->solveBuffer));
616d49cd2b7SBarry Smith                                          #else
6175f80ce2aSJacob Faibussowitsch                                          upTriFactor->solveInfo));
618afb2bd1cSJunchao Zhang                                          #endif
6199566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
6209566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
621aa372e3fSPaul Mullowney 
622da79fbbcSStefano Zampini         /* assign the pointer */
623aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
6242cbc15d9SMark         upTriFactor->AA_h = AAUp;
6259566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiUp));
6269566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjUp));
6279566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
628da79fbbcSStefano Zampini       } else {
6292cbc15d9SMark         if (!upTriFactor->AA_h) {
6309566063dSJacob Faibussowitsch           PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
6312cbc15d9SMark         }
632da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
633da79fbbcSStefano Zampini         offset = nzUpper;
634da79fbbcSStefano Zampini         for (i=n-1; i>=0; i--) {
635da79fbbcSStefano Zampini           v  = aa + adiag[i+1] + 1;
636da79fbbcSStefano Zampini 
637da79fbbcSStefano Zampini           /* number of elements NOT on the diagonal */
638da79fbbcSStefano Zampini           nz = adiag[i] - adiag[i+1]-1;
639da79fbbcSStefano Zampini 
640da79fbbcSStefano Zampini           /* decrement the offset */
641da79fbbcSStefano Zampini           offset -= (nz+1);
642da79fbbcSStefano Zampini 
643da79fbbcSStefano Zampini           /* first, set the diagonal elements */
6442cbc15d9SMark           upTriFactor->AA_h[offset] = 1./v[nz];
6459566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
646da79fbbcSStefano Zampini         }
6472cbc15d9SMark         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
6489566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
649da79fbbcSStefano Zampini       }
6509ae82921SPaul Mullowney     } catch(char *ex) {
65198921bdaSJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
6529ae82921SPaul Mullowney     }
6539ae82921SPaul Mullowney   }
6549ae82921SPaul Mullowney   PetscFunctionReturn(0);
6559ae82921SPaul Mullowney }
6569ae82921SPaul Mullowney 
657087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
6589ae82921SPaul Mullowney {
6599ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
6609ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
6619ae82921SPaul Mullowney   IS                           isrow = a->row,iscol = a->icol;
6629ae82921SPaul Mullowney   PetscBool                    row_identity,col_identity;
6639ae82921SPaul Mullowney   PetscInt                     n = A->rmap->n;
6649ae82921SPaul Mullowney 
6659ae82921SPaul Mullowney   PetscFunctionBegin;
66628b400f6SJacob Faibussowitsch   PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
6679566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
6689566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
6692205254eSKarl Rupp 
670da79fbbcSStefano Zampini   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
671aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=a->nz;
6729ae82921SPaul Mullowney 
673c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
674e057df02SPaul Mullowney   /* lower triangular indices */
6759566063dSJacob Faibussowitsch   PetscCall(ISIdentity(isrow,&row_identity));
676da79fbbcSStefano Zampini   if (!row_identity && !cusparseTriFactors->rpermIndices) {
677da79fbbcSStefano Zampini     const PetscInt *r;
678da79fbbcSStefano Zampini 
6799566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(isrow,&r));
680aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
681aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r+n);
6829566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(isrow,&r));
6839566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
684da79fbbcSStefano Zampini   }
6859ae82921SPaul Mullowney 
686e057df02SPaul Mullowney   /* upper triangular indices */
6879566063dSJacob Faibussowitsch   PetscCall(ISIdentity(iscol,&col_identity));
688da79fbbcSStefano Zampini   if (!col_identity && !cusparseTriFactors->cpermIndices) {
689da79fbbcSStefano Zampini     const PetscInt *c;
690da79fbbcSStefano Zampini 
6919566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(iscol,&c));
692aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
693aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c+n);
6949566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(iscol,&c));
6959566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
696da79fbbcSStefano Zampini   }
6979ae82921SPaul Mullowney   PetscFunctionReturn(0);
6989ae82921SPaul Mullowney }
6999ae82921SPaul Mullowney 
700087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
701087f3262SPaul Mullowney {
702087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
703087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
704aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
705aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
706087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
707087f3262SPaul Mullowney   PetscScalar                       *AAUp;
708087f3262SPaul Mullowney   PetscScalar                       *AALo;
709087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
710087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
711087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
712087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
713087f3262SPaul Mullowney 
714087f3262SPaul Mullowney   PetscFunctionBegin;
715cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
716c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
717087f3262SPaul Mullowney     try {
7189566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
7199566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
720da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
721087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
7229566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
7239566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));
724087f3262SPaul Mullowney 
725087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
726087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
727087f3262SPaul Mullowney         AiUp[n]=nzUpper;
728087f3262SPaul Mullowney         offset = 0;
729087f3262SPaul Mullowney         for (i=0; i<n; i++) {
730087f3262SPaul Mullowney           /* set the pointers */
731087f3262SPaul Mullowney           v  = aa + ai[i];
732087f3262SPaul Mullowney           vj = aj + ai[i];
733087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
734087f3262SPaul Mullowney 
735087f3262SPaul Mullowney           /* first, set the diagonal elements */
736087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
73709f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
738087f3262SPaul Mullowney           AiUp[i]      = offset;
73909f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
740087f3262SPaul Mullowney 
741087f3262SPaul Mullowney           offset+=1;
742087f3262SPaul Mullowney           if (nz>0) {
7439566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
7449566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
745087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
746087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
747087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
748087f3262SPaul Mullowney             }
749087f3262SPaul Mullowney             offset+=nz;
750087f3262SPaul Mullowney           }
751087f3262SPaul Mullowney         }
752087f3262SPaul Mullowney 
753aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
7549566063dSJacob Faibussowitsch         PetscCall(PetscNew(&upTriFactor));
755da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
756087f3262SPaul Mullowney 
757aa372e3fSPaul Mullowney         /* Create the matrix description */
7589566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
7599566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
7601b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
7619566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
762afb2bd1cSJunchao Zhang        #else
7639566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
764afb2bd1cSJunchao Zhang        #endif
7659566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
7669566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
767087f3262SPaul Mullowney 
768aa372e3fSPaul Mullowney         /* set the matrix */
769aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
770aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
771aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
772aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
773aa372e3fSPaul Mullowney 
774aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
775aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
776aa372e3fSPaul Mullowney 
777aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
778aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
779aa372e3fSPaul Mullowney 
780aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
781aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
782aa372e3fSPaul Mullowney 
783afb2bd1cSJunchao Zhang         /* set the operation */
784afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
785afb2bd1cSJunchao Zhang 
786afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
7879566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
7889566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
7891b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
7909566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
791afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
792afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
793afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
7945f80ce2aSJacob Faibussowitsch                                                &upTriFactor->solveBufferSize));
7959566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
796afb2bd1cSJunchao Zhang       #endif
797afb2bd1cSJunchao Zhang 
798aa372e3fSPaul Mullowney         /* perform the solve analysis */
7999566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
800aa372e3fSPaul Mullowney                                          upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
801aa372e3fSPaul Mullowney                                          upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
802d49cd2b7SBarry Smith                                          upTriFactor->csrMat->column_indices->data().get(),
8031b0a6780SStefano Zampini                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
804d49cd2b7SBarry Smith                                          upTriFactor->solveInfo,
8055f80ce2aSJacob Faibussowitsch                                          upTriFactor->solvePolicy, upTriFactor->solveBuffer));
806d49cd2b7SBarry Smith                                          #else
8075f80ce2aSJacob Faibussowitsch                                          upTriFactor->solveInfo));
808afb2bd1cSJunchao Zhang                                          #endif
8099566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
8109566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
811aa372e3fSPaul Mullowney 
812da79fbbcSStefano Zampini         /* assign the pointer */
813aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
814aa372e3fSPaul Mullowney 
815aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
8169566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
817da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
818aa372e3fSPaul Mullowney 
819aa372e3fSPaul Mullowney         /* Create the matrix description */
8209566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
8219566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
8221b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
8239566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
824afb2bd1cSJunchao Zhang        #else
8259566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
826afb2bd1cSJunchao Zhang        #endif
8279566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
8289566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
829aa372e3fSPaul Mullowney 
830aa372e3fSPaul Mullowney         /* set the operation */
831aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
832aa372e3fSPaul Mullowney 
833aa372e3fSPaul Mullowney         /* set the matrix */
834aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
835aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
836aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
837aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
838aa372e3fSPaul Mullowney 
839aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
840aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
841aa372e3fSPaul Mullowney 
842aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
843aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
844aa372e3fSPaul Mullowney 
845aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
846aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
847aa372e3fSPaul Mullowney 
848afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
8499566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
8509566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
8511b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
8529566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
853afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
854afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
855afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
8565f80ce2aSJacob Faibussowitsch                                                &loTriFactor->solveBufferSize));
8579566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
858afb2bd1cSJunchao Zhang       #endif
859afb2bd1cSJunchao Zhang 
860aa372e3fSPaul Mullowney         /* perform the solve analysis */
8619566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
862aa372e3fSPaul Mullowney                                          loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
863aa372e3fSPaul Mullowney                                          loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
864d49cd2b7SBarry Smith                                          loTriFactor->csrMat->column_indices->data().get(),
8651b0a6780SStefano Zampini                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
866d49cd2b7SBarry Smith                                          loTriFactor->solveInfo,
8675f80ce2aSJacob Faibussowitsch                                          loTriFactor->solvePolicy, loTriFactor->solveBuffer));
868d49cd2b7SBarry Smith                                          #else
8695f80ce2aSJacob Faibussowitsch                                          loTriFactor->solveInfo));
870afb2bd1cSJunchao Zhang                                          #endif
8719566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
8729566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
873aa372e3fSPaul Mullowney 
874da79fbbcSStefano Zampini         /* assign the pointer */
875aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
876087f3262SPaul Mullowney 
8779566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
8789566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiUp));
8799566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjUp));
880da79fbbcSStefano Zampini       } else {
881da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
882da79fbbcSStefano Zampini         offset = 0;
883da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
884da79fbbcSStefano Zampini           /* set the pointers */
885da79fbbcSStefano Zampini           v  = aa + ai[i];
886da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
887da79fbbcSStefano Zampini 
888da79fbbcSStefano Zampini           /* first, set the diagonal elements */
889da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
890da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
891da79fbbcSStefano Zampini 
892da79fbbcSStefano Zampini           offset+=1;
893da79fbbcSStefano Zampini           if (nz>0) {
8949566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
895da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
896da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
897da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
898da79fbbcSStefano Zampini             }
899da79fbbcSStefano Zampini             offset+=nz;
900da79fbbcSStefano Zampini           }
901da79fbbcSStefano Zampini         }
90228b400f6SJacob Faibussowitsch         PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
90328b400f6SJacob Faibussowitsch         PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
904da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
905da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
9069566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
907da79fbbcSStefano Zampini       }
9089566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFreeHost(AAUp));
9099566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFreeHost(AALo));
910087f3262SPaul Mullowney     } catch(char *ex) {
91198921bdaSJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
912087f3262SPaul Mullowney     }
913087f3262SPaul Mullowney   }
914087f3262SPaul Mullowney   PetscFunctionReturn(0);
915087f3262SPaul Mullowney }
916087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - builds the ICC/Cholesky triangular factors of A on the GPU
  (via MatSeqAIJCUSPARSEBuildICCTriMatrices()) and, when the factorization ordering a->row is not the
  identity, uploads the permutation and its inverse for use by the triangular solves.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  The work vector and the permutation arrays are allocated only if they do not exist yet, so that calling
  this routine again (e.g. on a repeated numeric factorization) refreshes their contents instead of leaking
  the previously allocated device arrays.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* both factors store the diagonal plus the a->nz-n off-diagonal entries; count the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* permutation indices: only needed when the ordering is not the natural one */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip));
    PetscCall(ISGetIndices(iip,&irip));
    PetscCall(ISGetIndices(ip,&rip));
    /* reuse existing device arrays when present; assign() resizes and overwrites them */
    if (!cusparseTriFactors->rpermIndices) { cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    if (!cusparseTriFactors->cpermIndices) { cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    PetscCall(ISRestoreIndices(iip,&irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip,&rip));
    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
953087f3262SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky factorization for SEQAIJCUSPARSE.

  The factorization itself is delegated to the host routine MatCholeskyFactorNumeric_SeqAIJ() after the
  values of A have been brought back from the GPU; afterwards the solve callbacks of B are selected
  according to the ordering and the triangular factors are analyzed and copied to the GPU.

  Input Parameters:
+ B    - the factor matrix
. A    - the matrix to factor
- info - factorization options, forwarded unchanged to the host routine
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact    = (Mat_SeqAIJ*)B->data;
  IS             ordering = fact->row;
  PetscBool      natural;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve variants matching the ordering */
  PetscCall(ISIdentity(ordering,&natural));
  if (!natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }
  /* the multi-right-hand-side solves are cleared for either ordering */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build the triangular factors on the GPU */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
9829ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private - builds the explicit transpose (CSC form) of one
  triangular factor on the GPU and runs the cuSPARSE triangular-solve analysis on it.

  Input Parameters:
+ A                  - the factored matrix (used only as the object attached to the log events)
. cusparseTriFactors - the factor container providing the cuSPARSE handle
- triFactor          - the triangular factor to transpose

  Output Parameter:
. triFactorTOut - the newly created transposed factor

  Notes:
  The descriptor of the transposed factor copies the matrix type, index base and diagonal type of the
  input factor and flips its fill mode. With CUDA >= 11 the csr2csc work buffer is allocated on, and its
  size stored in, the input factor (triFactor), not the transposed one.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(Mat A,Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors,Mat_SeqAIJCUSPARSETriFactorStruct *triFactor,Mat_SeqAIJCUSPARSETriFactorStruct **triFactorTOut)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *triFactorT;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  PetscCheck(triFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* allocate space for the transpose of the triangular factor */
  PetscCall(PetscNew(&triFactorT));
  triFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* read the matrix descriptors of the triangular factor; transposition swaps the fill mode */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase = cusparseGetMatIndexBase(triFactor->descr);
  fillMode = cusparseGetMatFillMode(triFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&triFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(triFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(triFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(triFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(triFactorT->descr, diagType));

  /* set the operation: the transpose is stored explicitly, so it is solved without transposition */
  triFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the triangular factor */
  triFactorT->csrMat = new CsrMatrix;
  triFactorT->csrMat->num_rows       = triFactor->csrMat->num_cols;
  triFactorT->csrMat->num_cols       = triFactor->csrMat->num_rows;
  triFactorT->csrMat->num_entries    = triFactor->csrMat->num_entries;
  triFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(triFactorT->csrMat->num_rows+1);
  triFactorT->csrMat->column_indices = new THRUSTINTARRAY32(triFactorT->csrMat->num_entries);
  triFactorT->csrMat->values         = new THRUSTARRAY(triFactorT->csrMat->num_entries);

  /* compute the transpose of the triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                                               triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                                               triFactor->csrMat->values->data().get(),
                                               triFactor->csrMat->row_offsets->data().get(),
                                               triFactor->csrMat->column_indices->data().get(),
                                               triFactorT->csrMat->values->data().get(),
                                               triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &triFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&triFactor->csr2cscBuffer,triFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                                  triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                                  triFactor->csrMat->values->data().get(),
                                  triFactor->csrMat->row_offsets->data().get(),
                                  triFactor->csrMat->column_indices->data().get(),
                                  triFactorT->csrMat->values->data().get(),
                                  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, triFactor->csr2cscBuffer));
                                  #else
                                  triFactorT->csrMat->column_indices->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
                                  #endif
  PetscCallCUDA(WaitForCUDA());
  /* close the event opened above (it was previously re-opened with a second Begin and never ended) */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparse_create_analysis_info(&triFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, triFactorT->solveOp,
                                         triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                                         triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                                         triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo,
                                         &triFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&triFactorT->solveBuffer,triFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, triFactorT->solveOp,
                                   triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                                   triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                                   triFactorT->csrMat->column_indices->data().get(),
                                   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                   triFactorT->solveInfo,
                                   triFactorT->solvePolicy, triFactorT->solveBuffer));
                                   #else
                                   triFactorT->solveInfo));
                                   #endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  *triFactorTOut = triFactorT;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - creates explicit transposes of the lower and upper triangular
  factors of A on the GPU, analyzes them for triangular solves, and stores them in
  loTriFactorPtrTranspose / upTriFactorPtrTranspose of the factor container held in A->spptr.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors whose lower and upper
      triangular factors have already been built
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = NULL;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = NULL;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* transpose of the lower triangular factor */
  PetscCall(MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(A,cusparseTriFactors,(Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr,&loTriFactorT));
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* transpose of the upper triangular factor */
  PetscCall(MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(A,cusparseTriFactors,(Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr,&upTriFactorT));
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1186bda325fcSPaul Mullowney 
/* Unary functor, callable on host and device, converting a PetscScalar to a PetscInt by
   casting its real part */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return static_cast<PetscInt>(PetscRealPart(s));
  }
};
1195a49f1ed0SStefano Zampini 
11963606e59fSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1197bda325fcSPaul Mullowney {
1198aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1199a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1200bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1201bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1202aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1203b175d8bbSPaul Mullowney 
1204bda325fcSPaul Mullowney   PetscFunctionBegin;
12059566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1206a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
120728b400f6SJacob Faibussowitsch   PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
1208a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
120908401ef6SPierre Jolivet   PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
12101a2c6b5cSJunchao Zhang   if (A->transupdated) PetscFunctionReturn(0);
12119566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
12129566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1213a49f1ed0SStefano Zampini   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
12149566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
1215a49f1ed0SStefano Zampini   }
1216a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1217aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
12189566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1219aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
12209566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
12219566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1222aa372e3fSPaul Mullowney 
1223b06137fdSPaul Mullowney     /* set alpha and beta */
12249566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
12259566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
12269566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
12279566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
12289566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
12299566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1230b06137fdSPaul Mullowney 
1231aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1232aa372e3fSPaul Mullowney       CsrMatrix *matrixT = new CsrMatrix;
1233a49f1ed0SStefano Zampini       matstructT->mat = matrixT;
1234554b8892SKarl Rupp       matrixT->num_rows = A->cmap->n;
1235554b8892SKarl Rupp       matrixT->num_cols = A->rmap->n;
1236aa372e3fSPaul Mullowney       matrixT->num_entries = a->nz;
1237a8bd5306SMark Adams       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1238aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1239aa372e3fSPaul Mullowney       matrixT->values = new THRUSTARRAY(a->nz);
1240a3fdcf43SKarl Rupp 
1241039c6fbaSStefano Zampini       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
124281902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1243afb2bd1cSJunchao Zhang 
1244afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
12453606e59fSJunchao Zhang       #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
1246afb2bd1cSJunchao Zhang         stat = cusparseCreateCsr(&matstructT->matDescr,
1247afb2bd1cSJunchao Zhang                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1248afb2bd1cSJunchao Zhang                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1249afb2bd1cSJunchao Zhang                                matrixT->values->data().get(),
1250afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
12519566063dSJacob Faibussowitsch                                indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
12523606e59fSJunchao Zhang       #else
12533606e59fSJunchao Zhang         /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
12543606e59fSJunchao Zhang            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
12553606e59fSJunchao Zhang 
12563606e59fSJunchao Zhang            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
12573606e59fSJunchao Zhang            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
12583606e59fSJunchao Zhang            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
12593606e59fSJunchao Zhang         */
12603606e59fSJunchao Zhang         if (matrixT->num_entries) {
12613606e59fSJunchao Zhang           stat = cusparseCreateCsr(&matstructT->matDescr,
12623606e59fSJunchao Zhang                                  matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
12633606e59fSJunchao Zhang                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
12643606e59fSJunchao Zhang                                  matrixT->values->data().get(),
12653606e59fSJunchao Zhang                                  CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
12669566063dSJacob Faibussowitsch                                  indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
12673606e59fSJunchao Zhang 
12683606e59fSJunchao Zhang         } else {
12693606e59fSJunchao Zhang           matstructT->matDescr = NULL;
12703606e59fSJunchao Zhang           matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
12713606e59fSJunchao Zhang         }
12723606e59fSJunchao Zhang       #endif
1273afb2bd1cSJunchao Zhang      #endif
1274aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1275afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1276afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1277afb2bd1cSJunchao Zhang    #else
1278aa372e3fSPaul Mullowney       CsrMatrix *temp  = new CsrMatrix;
127951c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
128051c6d536SStefano Zampini       /* First convert HYB to CSR */
1281aa372e3fSPaul Mullowney       temp->num_rows = A->rmap->n;
1282aa372e3fSPaul Mullowney       temp->num_cols = A->cmap->n;
1283aa372e3fSPaul Mullowney       temp->num_entries = a->nz;
1284aa372e3fSPaul Mullowney       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1285aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1286aa372e3fSPaul Mullowney       temp->values = new THRUSTARRAY(a->nz);
1287aa372e3fSPaul Mullowney 
1288aa372e3fSPaul Mullowney       stat = cusparse_hyb2csr(cusparsestruct->handle,
1289aa372e3fSPaul Mullowney                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1290aa372e3fSPaul Mullowney                               temp->values->data().get(),
1291aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
12929566063dSJacob Faibussowitsch                               temp->column_indices->data().get());PetscCallCUSPARSE(stat);
1293aa372e3fSPaul Mullowney 
1294aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1295aa372e3fSPaul Mullowney       tempT->num_rows = A->rmap->n;
1296aa372e3fSPaul Mullowney       tempT->num_cols = A->cmap->n;
1297aa372e3fSPaul Mullowney       tempT->num_entries = a->nz;
1298aa372e3fSPaul Mullowney       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1299aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1300aa372e3fSPaul Mullowney       tempT->values = new THRUSTARRAY(a->nz);
1301aa372e3fSPaul Mullowney 
1302aa372e3fSPaul Mullowney       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1303aa372e3fSPaul Mullowney                               temp->num_cols, temp->num_entries,
1304aa372e3fSPaul Mullowney                               temp->values->data().get(),
1305aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
1306aa372e3fSPaul Mullowney                               temp->column_indices->data().get(),
1307aa372e3fSPaul Mullowney                               tempT->values->data().get(),
1308aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
1309aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
13109566063dSJacob Faibussowitsch                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1311aa372e3fSPaul Mullowney 
1312aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1313aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
13149566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1315aa372e3fSPaul Mullowney       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1316aa372e3fSPaul Mullowney         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1317aa372e3fSPaul Mullowney       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1318aa372e3fSPaul Mullowney                               matstructT->descr, tempT->values->data().get(),
1319aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
1320aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
13219566063dSJacob Faibussowitsch                               hybMat, 0, partition);PetscCallCUSPARSE(stat);
1322aa372e3fSPaul Mullowney 
1323aa372e3fSPaul Mullowney       /* assign the pointer */
1324aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
13251a2c6b5cSJunchao Zhang       A->transupdated = PETSC_TRUE;
1326aa372e3fSPaul Mullowney       /* delete temporaries */
1327aa372e3fSPaul Mullowney       if (tempT) {
1328aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1329aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1330aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1331aa372e3fSPaul Mullowney         delete (CsrMatrix*) tempT;
1332087f3262SPaul Mullowney       }
1333aa372e3fSPaul Mullowney       if (temp) {
1334aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY*) temp->values;
1335aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1336aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1337aa372e3fSPaul Mullowney         delete (CsrMatrix*) temp;
1338aa372e3fSPaul Mullowney       }
1339afb2bd1cSJunchao Zhang      #endif
1340aa372e3fSPaul Mullowney     }
1341a49f1ed0SStefano Zampini   }
1342a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1343a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1344a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
134528b400f6SJacob Faibussowitsch     PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
134628b400f6SJacob Faibussowitsch     PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
134728b400f6SJacob Faibussowitsch     PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
134828b400f6SJacob Faibussowitsch     PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
134928b400f6SJacob Faibussowitsch     PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
135028b400f6SJacob Faibussowitsch     PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
135128b400f6SJacob Faibussowitsch     PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
135228b400f6SJacob Faibussowitsch     PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
1353a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1354a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1355a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
13569566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
1357a49f1ed0SStefano Zampini     }
1358a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1359a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1360a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1361a49f1ed0SStefano Zampini 
1362a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1363a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1364a49f1ed0SStefano Zampini       void   *csr2cscBuffer;
1365a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
1366a49f1ed0SStefano Zampini       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1367a49f1ed0SStefano Zampini                                            A->cmap->n, matrix->num_entries,
1368a49f1ed0SStefano Zampini                                            matrix->values->data().get(),
1369a49f1ed0SStefano Zampini                                            cusparsestruct->rowoffsets_gpu->data().get(),
1370a49f1ed0SStefano Zampini                                            matrix->column_indices->data().get(),
1371a49f1ed0SStefano Zampini                                            matrixT->values->data().get(),
1372a49f1ed0SStefano Zampini                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1373a49f1ed0SStefano Zampini                                            CUSPARSE_ACTION_NUMERIC,indexBase,
13749566063dSJacob Faibussowitsch                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
13759566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
1376a49f1ed0SStefano Zampini      #endif
1377a49f1ed0SStefano Zampini 
13781a2c6b5cSJunchao Zhang       if (matrix->num_entries) {
13791a2c6b5cSJunchao Zhang         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
13801a2c6b5cSJunchao Zhang            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
13811a2c6b5cSJunchao Zhang            I checked every parameters and they were just fine. I have no clue why cusparse complains.
13821a2c6b5cSJunchao Zhang 
13831a2c6b5cSJunchao Zhang            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
13841a2c6b5cSJunchao Zhang            should be filled with indexBase. So I just take a shortcut here.
13851a2c6b5cSJunchao Zhang         */
13861a2c6b5cSJunchao Zhang         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
13871a2c6b5cSJunchao Zhang                               A->cmap->n,matrix->num_entries,
13881a2c6b5cSJunchao Zhang                               csr2csc_a.data().get(),
13891a2c6b5cSJunchao Zhang                               cusparsestruct->rowoffsets_gpu->data().get(),
13901a2c6b5cSJunchao Zhang                               matrix->column_indices->data().get(),
1391a49f1ed0SStefano Zampini                               matrixT->values->data().get(),
1392a49f1ed0SStefano Zampini                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1393a49f1ed0SStefano Zampini                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1394a49f1ed0SStefano Zampini                               CUSPARSE_ACTION_NUMERIC,indexBase,
13959566063dSJacob Faibussowitsch                               cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
1396a49f1ed0SStefano Zampini                              #else
1397a49f1ed0SStefano Zampini                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
13989566063dSJacob Faibussowitsch                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1399a49f1ed0SStefano Zampini                              #endif
14001a2c6b5cSJunchao Zhang       } else {
14011a2c6b5cSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
14021a2c6b5cSJunchao Zhang       }
14031a2c6b5cSJunchao Zhang 
1404a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1405a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1406a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
14079566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(csr2cscBuffer));
1408a49f1ed0SStefano Zampini      #endif
1409a49f1ed0SStefano Zampini     }
1410a49f1ed0SStefano Zampini     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1411a49f1ed0SStefano Zampini                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1412a49f1ed0SStefano Zampini                                                      matrixT->values->begin()));
1413a49f1ed0SStefano Zampini   }
14149566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
14159566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1416213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1417213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1418aa372e3fSPaul Mullowney   /* assign the pointer */
1419aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
14201a2c6b5cSJunchao Zhang   A->transupdated = PETSC_TRUE;
1421bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1422bda325fcSPaul Mullowney }
1423bda325fcSPaul Mullowney 
1424a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - Transpose solve on the GPU using the transposed triangular
  factors kept in the Mat_SeqAIJCUSPARSETriFactors structure attached to A->spptr, applying the
  stored row and column permutations (rpermIndices / cpermIndices).

  Input Parameters:
+ A  - the matrix; A->spptr is interpreted as Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side, accessed read-only through VecCUDAGetArrayRead()

  Output Parameter:
. xx - the result, accessed through VecCUDAGetArrayWrite()

  Notes:
  If neither transposed factor exists yet, MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() is called to
  build them before solving.

  The Thrust copies are wrapped in PetscStackCallThrust(), as the other Thrust calls in this file
  are, so that a Thrust failure is reported through the PETSc error handler instead of escaping as
  an uncaught C++ exception.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    /* re-read the pointers: the analysis above is what populates them */
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: xGPU[i] = bGPU[rpermIndices[i]] */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
                                    thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
                                    xGPU));

  /* Next, solve U: input is xarray, output goes to the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve L: input is the work vector, output goes back to xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                         xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                    thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
                                    thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
                                    tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1511bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - Transpose solve on the GPU using the transposed
  triangular factors held in the Mat_SeqAIJCUSPARSETriFactors attached to A->spptr. No permutation
  is applied in this variant: bb feeds the first triangular solve directly.

  Input Parameters:
+ A  - the matrix; A->spptr is interpreted as Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side, accessed read-only through VecCUDAGetArrayRead()

  Output Parameter:
. xx - the result, accessed through VecCUDAGetArrayWrite()

  Notes:
  The transposed factors are built by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() on first use,
  i.e. when neither of them is present yet.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  PetscScalar                       *x_dev;
  const PetscScalar                 *b_dev;
  cusparseStatus_t                  status;

  PetscFunctionBegin;
  /* Build the transposed factors lazily when none of them exists yet */
  if (!(lowerT || upperT)) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* Device arrays: xx is obtained for writing, bb for reading */
  PetscCall(VecCUDAGetArrayWrite(xx,&x_dev));
  PetscCall(VecCUDAGetArrayRead(bb,&b_dev));

  PetscCall(PetscLogGpuTimeBegin());
  /* Step 1: solve with the transposed upper factor, b_dev -> work vector */
  status = cusparse_solve(factors->handle, upperT->solveOp,
                          upperT->csrMat->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          upperT->csrMat->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, upperT->descr,
                          upperT->csrMat->values->data().get(),
                          upperT->csrMat->row_offsets->data().get(),
                          upperT->csrMat->column_indices->data().get(),
                          upperT->solveInfo,
                          b_dev,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          work->data().get(),
                          upperT->solvePolicy, upperT->solveBuffer);PetscCallCUSPARSE(status);
                        #else
                          work->data().get());PetscCallCUSPARSE(status);
                        #endif

  /* Step 2: solve with the transposed lower factor, work vector -> x_dev */
  status = cusparse_solve(factors->handle, lowerT->solveOp,
                          lowerT->csrMat->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          lowerT->csrMat->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, lowerT->descr,
                          lowerT->csrMat->values->data().get(),
                          lowerT->csrMat->row_offsets->data().get(),
                          lowerT->csrMat->column_indices->data().get(),
                          lowerT->solveInfo,
                          work->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          x_dev,
                          lowerT->solvePolicy, lowerT->solveBuffer);PetscCallCUSPARSE(status);
                        #else
                          x_dev);PetscCallCUSPARSE(status);
                        #endif

  /* Hand the arrays back, then close the timing region and log the flop count */
  PetscCall(VecCUDARestoreArrayRead(bb,&b_dev));
  PetscCall(VecCUDARestoreArrayWrite(xx,&x_dev));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1580bda325fcSPaul Mullowney 
15816fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
15829ae82921SPaul Mullowney {
1583465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1584465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1585465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1586465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
15879ae82921SPaul Mullowney   cusparseStatus_t                      stat;
15889ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1589aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1590aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1591aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
15929ae82921SPaul Mullowney 
15939ae82921SPaul Mullowney   PetscFunctionBegin;
1594ebc8f436SDominic Meiser 
1595e057df02SPaul Mullowney   /* Get the GPU pointers */
15969566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
15979566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb,&barray));
1598c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1599c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
16009ae82921SPaul Mullowney 
16019566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1602aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1603a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1604c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
16054e4bbfaaSStefano Zampini                tempGPU->begin());
1606aa372e3fSPaul Mullowney 
1607aa372e3fSPaul Mullowney   /* Next, solve L */
1608aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1609afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_rows,
16101b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1611afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_entries,
1612afb2bd1cSJunchao Zhang                       #endif
1613afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1614aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1615aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1616aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1617aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1618d49cd2b7SBarry Smith                         tempGPU->data().get(),
16191b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1620d49cd2b7SBarry Smith                          xarray,
16219566063dSJacob Faibussowitsch                          loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
1622d49cd2b7SBarry Smith                       #else
16239566063dSJacob Faibussowitsch                          xarray);PetscCallCUSPARSE(stat);
1624afb2bd1cSJunchao Zhang                       #endif
1625aa372e3fSPaul Mullowney 
1626aa372e3fSPaul Mullowney   /* Then, solve U */
1627aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1628afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_rows,
16291b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1630afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_entries,
1631afb2bd1cSJunchao Zhang                       #endif
1632afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1633aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1634aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1635aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1636d49cd2b7SBarry Smith                         upTriFactor->solveInfo,xarray,
16371b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1638d49cd2b7SBarry Smith                         tempGPU->data().get(),
16399566063dSJacob Faibussowitsch                         upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
1640d49cd2b7SBarry Smith                       #else
16419566063dSJacob Faibussowitsch                         tempGPU->data().get());PetscCallCUSPARSE(stat);
1642afb2bd1cSJunchao Zhang                       #endif
1643d49cd2b7SBarry Smith 
16444e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
1645a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
16464e4bbfaaSStefano Zampini                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
16474e4bbfaaSStefano Zampini                xGPU);
16489ae82921SPaul Mullowney 
16499566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb,&barray));
16509566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
16519566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16529566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
16539ae82921SPaul Mullowney   PetscFunctionReturn(0);
16549ae82921SPaul Mullowney }
16559ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Solves with the triangular factors attached to A, applying no
  row or column permutation to the vectors.

  Input Parameters:
+ A  - matrix whose spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side, accessed through VecCUDAGetArrayRead()

  Output Parameter:
. xx - solution, accessed through VecCUDAGetArrayWrite()

  Notes:
  The lower factor is solved from bb into the factors' work vector, then the upper factor is solved
  from the work vector into xx.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  CsrMatrix                         *lcsr,*ucsr;
  PetscScalar                       *sol,*scratch;
  const PetscScalar                 *rhs;
  cusparseStatus_t                  stat;

  PetscFunctionBegin;
  /* Views of the solution and of the right-hand side */
  PetscCall(VecCUDAGetArrayWrite(xx,&sol));
  PetscCall(VecCUDAGetArrayRead(bb,&rhs));

  PetscCall(PetscLogGpuTimeBegin());

  /* Stage 1: lower factor, rhs -> scratch */
  lcsr    = lower->csrMat;
  scratch = work->data().get();
  stat = cusparse_solve(factors->handle, lower->solveOp,
                        lcsr->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        lcsr->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, lower->descr,
                        lcsr->values->data().get(),
                        lcsr->row_offsets->data().get(),
                        lcsr->column_indices->data().get(),
                        lower->solveInfo,
                        rhs,
                        scratch
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,lower->solvePolicy,lower->solveBuffer
                      #endif
                        );
  PetscCallCUSPARSE(stat);

  /* Stage 2: upper factor, scratch -> sol */
  ucsr    = upper->csrMat;
  scratch = work->data().get();
  stat = cusparse_solve(factors->handle, upper->solveOp,
                        ucsr->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ucsr->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upper->descr,
                        ucsr->values->data().get(),
                        ucsr->row_offsets->data().get(),
                        ucsr->column_indices->data().get(),
                        upper->solveInfo,
                        scratch,
                        sol
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upper->solvePolicy, upper->solveBuffer
                      #endif
                        );
  PetscCallCUSPARSE(stat);

  PetscCall(VecCUDARestoreArrayRead(bb,&rhs));
  PetscCall(VecCUDARestoreArrayWrite(xx,&sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
17169ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - Copies the matrix values from the GPU CSR storage into the host array a->a.

  Input Parameter:
. A - the matrix

  Notes:
  Does nothing unless A->offloadmask is PETSC_OFFLOAD_GPU. Only the a->nz numerical values are copied;
  on completion A->offloadmask is set to PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_GPU) {
    PetscFunctionReturn(0);
  }
  {
    Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
    Mat_SeqAIJCUSPARSE *gpu    = (Mat_SeqAIJCUSPARSE*)A->spptr;
    CsrMatrix          *csr    = (CsrMatrix*)gpu->mat->mat;
    const size_t       nbytes  = aij->nz*sizeof(PetscScalar);

    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    PetscCallCUDA(cudaMemcpy(aij->a, csr->values->data().get(), nbytes, cudaMemcpyDeviceToHost));
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogGpuToCpu(nbytes));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
17357e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - Returns the host values array for read/write access,
  after MatSeqAIJCUSPARSECopyFromGPU() has been given the chance to refresh it.

  Input Parameter:
. A - the matrix

  Output Parameter:
. array - set to the a field of the Mat_SeqAIJ data of A
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = aij->a;
  PetscFunctionReturn(0);
}
174367a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArray_SeqAIJCUSPARSE - Ends read/write access to the host values array.

  Input Parameters:
+ A     - the matrix
- array - the pointer handed out earlier; reset to NULL

  Notes:
  A->offloadmask becomes PETSC_OFFLOAD_CPU, which is one of the states in which
  MatSeqAIJCUSPARSECopyToGPU() transfers the data again.
*/
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0]       = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
175167a45760SJunchao Zhang 
/*
  MatSeqAIJGetArrayRead_SeqAIJCUSPARSE - Returns the host values array for read-only access,
  after MatSeqAIJCUSPARSECopyFromGPU() has been given the chance to refresh it.

  Input Parameter:
. A - the matrix

  Output Parameter:
. array - set to the a field of the Mat_SeqAIJ data of A
*/
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = aij->a;
  PetscFunctionReturn(0);
}
175967a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE - Ends read-only access to the host values array.

  Input Parameters:
+ A     - the matrix (not modified here; the offload mask is left as it is)
- array - the pointer handed out earlier; reset to NULL
*/
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0] = NULL;
  PetscFunctionReturn(0);
}
176667a45760SJunchao Zhang 
/*
  MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE - Returns the host values array for write-only access.

  Input Parameter:
. A - the matrix

  Output Parameter:
. array - set to the a field of the Mat_SeqAIJ data of A

  Notes:
  Unlike the read and read/write variants, nothing is copied back from the GPU first.
*/
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
177367a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE - Ends write-only access to the host values array.

  Input Parameters:
+ A     - the matrix
- array - the pointer handed out earlier; reset to NULL

  Notes:
  A->offloadmask becomes PETSC_OFFLOAD_CPU, which is one of the states in which
  MatSeqAIJCUSPARSECopyToGPU() transfers the data again.
*/
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0]       = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
17817e8381f9SStefano Zampini 
/*
  MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE - Gives access to the CSR arrays of the GPU copy of A.

  Input Parameter:
. A - the matrix; must not be factored and must carry GPU data in A->spptr

  Output Parameters (each may be NULL, in which case it is skipped):
+ i     - row offsets of the GPU CSR matrix
. j     - column indices of the GPU CSR matrix
. a     - numerical values of the GPU CSR matrix
- mtype - always set to PETSC_MEMTYPE_CUDA

  Notes:
  The matrix state is validated before MatSeqAIJCUSPARSECopyToGPU() is called, because that routine
  already reads A->spptr as a Mat_SeqAIJCUSPARSE; checking afterwards would come too late.

  Requesting i or j raises PETSC_ERR_SUP when PETSc is built with 64-bit indices.
  The ELL and HYB storage formats raise PETSC_ERR_SUP since they do not keep a CsrMatrix on the GPU.
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE           *cusp;
  CsrMatrix                    *matrix;

  PetscFunctionBegin;
  /* For factored matrices A->spptr holds triangular-factor data, not a Mat_SeqAIJCUSPARSE: reject them up front */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
  PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  /* cusp->mat->mat is a CsrMatrix only for the CSR format; for ELL/HYB it is a cusparseHybMat_t */
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not implemented for the ELL and HYB storage formats");
  matrix = (CsrMatrix*)cusp->mat->mat;

  if (i) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
   #else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices");
   #endif
  }
  if (j) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
   #else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices");
   #endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}
18127ee59b9bSJunchao Zhang 
/*
  MatSeqAIJCUSPARSECopyToGPU - Brings the GPU copy of a SeqAIJCUSPARSE matrix up to date with the host CSR data.

  Input Parameter:
. A - the matrix; A->spptr is read as a Mat_SeqAIJCUSPARSE and A->data as a Mat_SeqAIJ

  Notes:
  Nothing is transferred unless A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  When the nonzero state recorded on the GPU side matches A->nonzerostate and the storage format is
  MAT_CUSPARSE_CSR, only the numerical values are copied. Otherwise the GPU matrix, the work vector and
  the cached row offsets are destroyed and rebuilt from the host arrays (using the compressed-row
  description when a->compressedrow.use is set).

  If the host values array a->a is missing, only the sparsity pattern is uploaded and A->offloadmask is
  left untouched; in every other case it becomes PETSC_OFFLOAD_BOTH.

  Errors if A is bound to the CPU, if required host CSR arrays are missing, or with PETSC_ERR_SUP for the
  ELL/HYB formats when built against CUDA 11.0 or later.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      matrix->values->assign(a->a, a->a+a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* same pattern, new values: keep the transpose structure but mark it as out of date */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    } else {
      PetscInt nnz;

      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* the pattern (or the format) changed: throw away everything that was built for the old matrix */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        /* without host values only the pattern can be uploaded; the offload mask must then stay as it is */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;
        PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        Mat_SeqAIJCUSPARSEMultStruct *matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* the scalars 1, 0 and 1 are kept in device memory since the handle is switched to device pointer mode */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* temporary CSR copy on the GPU, only used as the input of the conversion */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR copy is no longer needed; all of its pieces were allocated right above */
          delete mat->values;
          delete mat->column_indices;
          delete mat->row_offsets;
          delete mat;
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* log what was really sent: m+1 row offsets and nnz column indices as int, tmp compressed-row indices,
           the three scalar constants, and the nnz values only if the host values array exists */
        PetscCall(PetscLogCpuToGpu(((m+1)+nnz)*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->a ? nnz : 0))*sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
19679ae82921SPaul Mullowney 
/* Functor for tuple-valued iterators: accumulates element 0 of the tuple into element 1 */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple entry)
  {
    thrust::get<1>(entry) += thrust::get<0>(entry);
  }
};
1977aa372e3fSPaul Mullowney 
/* Functor for tuple-valued iterators: copies element 0 of the tuple into element 1 */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple entry)
  {
    thrust::get<1>(entry) = thrust::get<0>(entry);
  }
};
19877e8381f9SStefano Zampini 
/* Functor for tuple-valued iterators: copies element 1 of the tuple into element 0 */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple entry)
  {
    thrust::get<0>(entry) = thrust::get<1>(entry);
  }
};
1997e6e9a74fSStefano Zampini 
/*
  Data attached to a matrix product that involves a SeqAIJCUSPARSE matrix.
  Everything owned by this struct is released by MatDestroy_MatMatCusparse().
*/
struct MatMatCusparse {
  PetscBool   cisdense;      /* NOTE(review): presumably remembers the original type of C - confirm at the use sites */
  PetscScalar *Bt;           /* buffer released with cudaFree(); NOTE(review): looks like storage for a transposed B - confirm */
  Mat         X;             /* auxiliary matrix, destroyed with MatDestroy() */
  PetscBool   reusesym;      /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;      /* NOTE(review): presumably the flop count logged for the product - confirm */
  CsrMatrix   *Bcsr;         /* only the CsrMatrix struct itself is deleted on destroy, not the arrays it points to */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;   /* sparse descriptor, released with cusparseDestroySpMat() */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;     /* dense descriptor, released with cusparseDestroyDnMat() */
  cusparseDnMatDescr_t  matCDescr;     /* dense descriptor, released with cusparseDestroyDnMat() */
  PetscInt              Blda,Clda;     /* Record leading dimensions of B and C here to detect changes */
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;     /* buffer released with cudaFree() */
  void                  *dBuffer5;     /* buffer released with cudaFree() */
 #endif
  size_t                mmBufferSize;  /* NOTE(review): presumably the size in bytes of mmBuffer - confirm */
  void                  *mmBuffer;     /* buffer released with cudaFree() */
  void                  *mmBuffer2;    /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;    /* released with cusparseSpGEMM_destroyDescr() */
#endif
};
2022ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - Releases a MatMatCusparse product-data struct and what it owns.

  Input Parameter:
. data - pointer to the MatMatCusparse to free

  Notes:
  Descriptors and buffers that are only present with CUDA 11.0 (or 11.4) and later are released
  only if they are set. The struct itself is freed last with PetscFree().
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse*)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mm->Bt));
  delete mm->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* cuSPARSE descriptors */
  if (mm->matSpBDescr) { PetscCallCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr)); }
  if (mm->matBDescr)   { PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matBDescr)); }
  if (mm->matCDescr)   { PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matCDescr)); }
  if (mm->spgemmDesc)  { PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc)); }
  /* work buffers */
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mm->dBuffer4)    { PetscCallCUDA(cudaFree(mm->dBuffer4)); }
  if (mm->dBuffer5)    { PetscCallCUDA(cudaFree(mm->dBuffer5)); }
 #endif
  if (mm->mmBuffer)    { PetscCallCUDA(cudaFree(mm->mmBuffer)); }
  if (mm->mmBuffer2)   { PetscCallCUDA(cudaFree(mm->mmBuffer2)); }
#endif
  PetscCall(MatDestroy(&mm->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2046ccdfe979SStefano Zampini 
2047ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2048ccdfe979SStefano Zampini 
2049ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2050ccdfe979SStefano Zampini {
2051ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2052ccdfe979SStefano Zampini   Mat                          A,B;
2053afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
2054ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
2055ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2056ccdfe979SStefano Zampini   cusparseStatus_t             stat;
2057ccdfe979SStefano Zampini   cusparseOperation_t          opA;
2058ccdfe979SStefano Zampini   const PetscScalar            *barray;
2059ccdfe979SStefano Zampini   PetscScalar                  *carray;
2060ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
2061ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2062ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2063ccdfe979SStefano Zampini 
2064ccdfe979SStefano Zampini   PetscFunctionBegin;
2065ccdfe979SStefano Zampini   MatCheckProduct(C,1);
206628b400f6SJacob Faibussowitsch   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2067ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
2068ccdfe979SStefano Zampini   A    = product->A;
2069ccdfe979SStefano Zampini   B    = product->B;
20709566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
207128b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2072ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2073ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
207428b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
20759566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2076ccdfe979SStefano Zampini   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2077ccdfe979SStefano Zampini   switch (product->type) {
2078ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2079ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2080ccdfe979SStefano Zampini     mat = cusp->mat;
2081ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2082ccdfe979SStefano Zampini     m   = A->rmap->n;
2083ccdfe979SStefano Zampini     n   = B->cmap->n;
2084ccdfe979SStefano Zampini     break;
2085ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
20861a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2087e6e9a74fSStefano Zampini       mat = cusp->mat;
2088e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2089e6e9a74fSStefano Zampini     } else {
20909566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2091ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
2092ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2093e6e9a74fSStefano Zampini     }
2094ccdfe979SStefano Zampini     m = A->cmap->n;
2095ccdfe979SStefano Zampini     n = B->cmap->n;
2096ccdfe979SStefano Zampini     break;
2097ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2098ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2099ccdfe979SStefano Zampini     mat = cusp->mat;
2100ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2101ccdfe979SStefano Zampini     m   = A->rmap->n;
2102ccdfe979SStefano Zampini     n   = B->rmap->n;
2103ccdfe979SStefano Zampini     break;
2104ccdfe979SStefano Zampini   default:
210598921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2106ccdfe979SStefano Zampini   }
210728b400f6SJacob Faibussowitsch   PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2108ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
2109ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
21109566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
21119566063dSJacob Faibussowitsch   if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
21129566063dSJacob Faibussowitsch   PetscCall(MatDenseCUDAGetArrayRead(B,&barray));
2113afb2bd1cSJunchao Zhang 
21149566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B,&blda));
2115c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
21169566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
21179566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(mmdata->X,&clda));
2118c8378d12SStefano Zampini   } else {
21199566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDAGetArrayWrite(C,&carray));
21209566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(C,&clda));
2121c8378d12SStefano Zampini   }
2122c8378d12SStefano Zampini 
21239566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2124afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2125afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2126a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2127afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2128fcdce8c4SStefano Zampini     size_t mmBufferSize;
21299566063dSJacob Faibussowitsch     if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
2130afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
21319566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
2132afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2133afb2bd1cSJunchao Zhang     }
2134c8378d12SStefano Zampini 
21359566063dSJacob Faibussowitsch     if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
2136afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
21379566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
2138afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2139afb2bd1cSJunchao Zhang     }
2140afb2bd1cSJunchao Zhang 
2141afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2142afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2143afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2144afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2145afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2146afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
21479566063dSJacob Faibussowitsch                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
2148afb2bd1cSJunchao Zhang     }
2149afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2150afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2151afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
21529566063dSJacob Faibussowitsch                                    cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat);
2153fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
21549566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
21559566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
2156fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2157fcdce8c4SStefano Zampini     }
2158afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2159afb2bd1cSJunchao Zhang   } else {
2160afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
21619566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
21629566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
21639566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
2164afb2bd1cSJunchao Zhang   }
2165afb2bd1cSJunchao Zhang 
2166afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2167afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2168afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2169afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
21709566063dSJacob Faibussowitsch                       cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat);
2171afb2bd1cSJunchao Zhang  #else
2172afb2bd1cSJunchao Zhang   PetscInt k;
2173afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2174ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2175ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2176ccdfe979SStefano Zampini     cublasStatus_t cerr;
2177ccdfe979SStefano Zampini 
21789566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2179ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2180ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2181ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2182ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
21839566063dSJacob Faibussowitsch                        mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr);
2184ccdfe979SStefano Zampini     blda = B->cmap->n;
2185afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2186afb2bd1cSJunchao Zhang   } else {
2187afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2188ccdfe979SStefano Zampini   }
2189ccdfe979SStefano Zampini 
2190afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2191ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2192afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2193ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2194ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2195ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2196ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
21979566063dSJacob Faibussowitsch                            carray,clda);PetscCallCUSPARSE(stat);
2198afb2bd1cSJunchao Zhang  #endif
21999566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
22009566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
22019566063dSJacob Faibussowitsch   PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
2202ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
22039566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
22049566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
2205ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
22069566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
22079566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
2208ccdfe979SStefano Zampini   } else {
22099566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
2210ccdfe979SStefano Zampini   }
2211ccdfe979SStefano Zampini   if (mmdata->cisdense) {
22129566063dSJacob Faibussowitsch     PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
2213ccdfe979SStefano Zampini   }
2214ccdfe979SStefano Zampini   if (!biscuda) {
22159566063dSJacob Faibussowitsch     PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
2216ccdfe979SStefano Zampini   }
2217ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2218ccdfe979SStefano Zampini }
2219ccdfe979SStefano Zampini 
2220ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2221ccdfe979SStefano Zampini {
2222ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2223ccdfe979SStefano Zampini   Mat                A,B;
2224ccdfe979SStefano Zampini   PetscInt           m,n;
2225ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
2226ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2227ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2228ccdfe979SStefano Zampini 
2229ccdfe979SStefano Zampini   PetscFunctionBegin;
2230ccdfe979SStefano Zampini   MatCheckProduct(C,1);
223128b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2232ccdfe979SStefano Zampini   A    = product->A;
2233ccdfe979SStefano Zampini   B    = product->B;
22349566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
223528b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2236ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
223708401ef6SPierre Jolivet   PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2238ccdfe979SStefano Zampini   switch (product->type) {
2239ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2240ccdfe979SStefano Zampini     m = A->rmap->n;
2241ccdfe979SStefano Zampini     n = B->cmap->n;
2242ccdfe979SStefano Zampini     break;
2243ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2244ccdfe979SStefano Zampini     m = A->cmap->n;
2245ccdfe979SStefano Zampini     n = B->cmap->n;
2246ccdfe979SStefano Zampini     break;
2247ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2248ccdfe979SStefano Zampini     m = A->rmap->n;
2249ccdfe979SStefano Zampini     n = B->rmap->n;
2250ccdfe979SStefano Zampini     break;
2251ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2252ccdfe979SStefano Zampini     m = B->cmap->n;
2253ccdfe979SStefano Zampini     n = B->cmap->n;
2254ccdfe979SStefano Zampini     break;
2255ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2256ccdfe979SStefano Zampini     m = B->rmap->n;
2257ccdfe979SStefano Zampini     n = B->rmap->n;
2258ccdfe979SStefano Zampini     break;
2259ccdfe979SStefano Zampini   default:
226098921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2261ccdfe979SStefano Zampini   }
22629566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C,m,n,m,n));
2263ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
22649566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
22659566063dSJacob Faibussowitsch   PetscCall(MatSetType(C,MATSEQDENSECUDA));
2266ccdfe979SStefano Zampini 
2267ccdfe979SStefano Zampini   /* product data */
22689566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2269ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2270afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2271afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2272ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
22739566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
2274ccdfe979SStefano Zampini   }
2275afb2bd1cSJunchao Zhang  #endif
2276ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2277ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
22789566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
22799566063dSJacob Faibussowitsch     PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
2280ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
22819566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
2282ccdfe979SStefano Zampini     } else {
22839566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
2284ccdfe979SStefano Zampini     }
2285ccdfe979SStefano Zampini   }
2286ccdfe979SStefano Zampini   C->product->data    = mmdata;
2287ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2288ccdfe979SStefano Zampini 
2289ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2290ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2291ccdfe979SStefano Zampini }
2292ccdfe979SStefano Zampini 
2293fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2294ccdfe979SStefano Zampini {
2295ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2296fcdce8c4SStefano Zampini   Mat                          A,B;
2297fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2298fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2299fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2300fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2301fcdce8c4SStefano Zampini   PetscBool                    flg;
2302fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2303fcdce8c4SStefano Zampini   MatProductType               ptype;
2304fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2305fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2306fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2307fcdce8c4SStefano Zampini #endif
2308b4285af6SJunchao Zhang   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2309ccdfe979SStefano Zampini 
2310ccdfe979SStefano Zampini   PetscFunctionBegin;
2311ccdfe979SStefano Zampini   MatCheckProduct(C,1);
231228b400f6SJacob Faibussowitsch   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
23139566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
231428b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2315fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse*)C->product->data;
2316fcdce8c4SStefano Zampini   A = product->A;
2317fcdce8c4SStefano Zampini   B = product->B;
2318fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2319fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2320fcdce8c4SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
232108401ef6SPierre Jolivet     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2322fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
232328b400f6SJacob Faibussowitsch     PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2324fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix*)Cmat->mat;
232528b400f6SJacob Faibussowitsch     PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2326fcdce8c4SStefano Zampini     goto finalize;
2327fcdce8c4SStefano Zampini   }
2328fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
23299566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
233028b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
23319566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
233228b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
233328b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
233428b400f6SJacob Faibussowitsch   PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2335fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2336fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2337fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
233808401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
233908401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
234008401ef6SPierre Jolivet   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
23419566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
23429566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2343fcdce8c4SStefano Zampini 
2344fcdce8c4SStefano Zampini   ptype = product->type;
2345fa046f9fSJunchao Zhang   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2346fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
234728b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
2348fa046f9fSJunchao Zhang   }
2349fa046f9fSJunchao Zhang   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2350fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
235128b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
2352fa046f9fSJunchao Zhang   }
2353fcdce8c4SStefano Zampini   switch (ptype) {
2354fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2355fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2356fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2357fcdce8c4SStefano Zampini     break;
2358fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2359fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2360fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2361fcdce8c4SStefano Zampini     break;
2362fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2363fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2364fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2365fcdce8c4SStefano Zampini     break;
2366fcdce8c4SStefano Zampini   default:
236798921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2368fcdce8c4SStefano Zampini   }
2369fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
237028b400f6SJacob Faibussowitsch   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
237128b400f6SJacob Faibussowitsch   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
237228b400f6SJacob Faibussowitsch   PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2373fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2374fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2375fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix*)Cmat->mat;
237628b400f6SJacob Faibussowitsch   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
237728b400f6SJacob Faibussowitsch   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
237828b400f6SJacob Faibussowitsch   PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
23799566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2380fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2381fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
23829566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2383b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2384b4285af6SJunchao Zhang     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2385b4285af6SJunchao Zhang                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2386b4285af6SJunchao Zhang                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
23879566063dSJacob Faibussowitsch                                mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2388b4285af6SJunchao Zhang   #else
2389b4285af6SJunchao Zhang     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2390fcdce8c4SStefano Zampini                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2391fcdce8c4SStefano Zampini                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
23929566063dSJacob Faibussowitsch                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
2393b4285af6SJunchao Zhang     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2394fcdce8c4SStefano Zampini                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
23959566063dSJacob Faibussowitsch                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2396b4285af6SJunchao Zhang   #endif
2397fcdce8c4SStefano Zampini #else
2398b4285af6SJunchao Zhang   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2399fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2400fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2401fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
24029566063dSJacob Faibussowitsch                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
2403fcdce8c4SStefano Zampini #endif
24049566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
24059566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
24069566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
2407fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
2408fcdce8c4SStefano Zampini finalize:
2409fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
24109566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
24119566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
24129566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
2413fcdce8c4SStefano Zampini   c->reallocs         = 0;
2414fcdce8c4SStefano Zampini   C->info.mallocs    += 0;
2415fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
2416fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
2417fcdce8c4SStefano Zampini   C->num_ass++;
2418ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2419ccdfe979SStefano Zampini }
2420fcdce8c4SStefano Zampini 
2421fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2422fcdce8c4SStefano Zampini {
2423fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2424fcdce8c4SStefano Zampini   Mat                          A,B;
2425fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2426fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2427fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2428fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2429fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2430fcdce8c4SStefano Zampini   PetscBool                    flg;
2431fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2432fcdce8c4SStefano Zampini   MatProductType               ptype;
2433fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2434fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2435fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2436fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2437fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2438fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2439fcdce8c4SStefano Zampini #else
2440fcdce8c4SStefano Zampini   int                          cnz;
2441fcdce8c4SStefano Zampini #endif
2442b4285af6SJunchao Zhang   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2443fcdce8c4SStefano Zampini 
2444fcdce8c4SStefano Zampini   PetscFunctionBegin;
2445fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
244628b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2447fcdce8c4SStefano Zampini   A    = product->A;
2448fcdce8c4SStefano Zampini   B    = product->B;
24499566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
245028b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
24519566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
245228b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2453fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2454fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2455fcdce8c4SStefano Zampini   /* product data */
24569566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2457fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2458fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2459fcdce8c4SStefano Zampini 
24609566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
24619566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2462d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2463d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
246408401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
246508401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2466d60bce21SJunchao Zhang 
2467fcdce8c4SStefano Zampini   ptype = product->type;
2468fa046f9fSJunchao Zhang   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2469fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2470fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2471fa046f9fSJunchao Zhang   }
2472fa046f9fSJunchao Zhang   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2473fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2474fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2475fa046f9fSJunchao Zhang   }
2476fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2477fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2478fcdce8c4SStefano Zampini   switch (ptype) {
2479fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2480fcdce8c4SStefano Zampini     m = A->rmap->n;
2481fcdce8c4SStefano Zampini     n = B->cmap->n;
2482fcdce8c4SStefano Zampini     k = A->cmap->n;
2483fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2484fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2485fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2486fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2487fcdce8c4SStefano Zampini     break;
2488fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2489fcdce8c4SStefano Zampini     m = A->cmap->n;
2490fcdce8c4SStefano Zampini     n = B->cmap->n;
2491fcdce8c4SStefano Zampini     k = A->rmap->n;
24929566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2493fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2494fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2495fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2496fcdce8c4SStefano Zampini     break;
2497fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2498fcdce8c4SStefano Zampini     m = A->rmap->n;
2499fcdce8c4SStefano Zampini     n = B->rmap->n;
2500fcdce8c4SStefano Zampini     k = A->cmap->n;
25019566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2502fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2503fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2504fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2505fcdce8c4SStefano Zampini     break;
2506fcdce8c4SStefano Zampini   default:
250798921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2508fcdce8c4SStefano Zampini   }
2509fcdce8c4SStefano Zampini 
2510fcdce8c4SStefano Zampini   /* create cusparse matrix */
25119566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C,m,n,m,n));
25129566063dSJacob Faibussowitsch   PetscCall(MatSetType(C,MATSEQAIJCUSPARSE));
2513fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2514fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2515fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2516fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2517fcdce8c4SStefano Zampini 
2518fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2519fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2520fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
25219566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
25229566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
2523fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2524fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2525fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2526fcdce8c4SStefano Zampini   } else {
2527fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2528fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2529fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2530fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2531fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2532fcdce8c4SStefano Zampini   }
2533fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2534fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2535fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2536fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2537fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2538fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
25399566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
25409566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
25419566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
25429566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
25439566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
25449566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
25459566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
25469566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
25479566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
2548fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2549fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2550fcdce8c4SStefano Zampini     c->nz = 0;
2551fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2552fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2553fcdce8c4SStefano Zampini     goto finalizesym;
2554fcdce8c4SStefano Zampini   }
2555fcdce8c4SStefano Zampini 
255628b400f6SJacob Faibussowitsch   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
255728b400f6SJacob Faibussowitsch   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2558fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2559fcdce8c4SStefano Zampini   if (!biscompressed) {
2560fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2561fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2562fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2563fcdce8c4SStefano Zampini #endif
2564fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2565fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2566fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2567fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2568fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2569fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2570fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2571fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2572fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2573fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2574fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
25759566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
2576fcdce8c4SStefano Zampini     }
2577fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2578fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2579fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2580fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2581fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2582fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2583fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2584fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
25859566063dSJacob Faibussowitsch                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
2586fcdce8c4SStefano Zampini     }
2587fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2588fcdce8c4SStefano Zampini #endif
2589fcdce8c4SStefano Zampini   }
259028b400f6SJacob Faibussowitsch   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
259128b400f6SJacob Faibussowitsch   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2592fcdce8c4SStefano Zampini   /* precompute flops count */
2593fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2594fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2595fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2596fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2597fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2598fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2599fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2600fcdce8c4SStefano Zampini       }
2601fcdce8c4SStefano Zampini     }
2602fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2603fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2604fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2605fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2606fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2607fcdce8c4SStefano Zampini     }
2608fcdce8c4SStefano Zampini   } else { /* TODO */
2609fcdce8c4SStefano Zampini     flops = 0.;
2610fcdce8c4SStefano Zampini   }
2611fcdce8c4SStefano Zampini 
2612fcdce8c4SStefano Zampini   mmdata->flops = flops;
26139566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2614b4285af6SJunchao Zhang 
2615fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
26169566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2617fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2618fcdce8c4SStefano Zampini                           NULL, NULL, NULL,
2619fcdce8c4SStefano Zampini                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
26209566063dSJacob Faibussowitsch                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
26219566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2622b4285af6SJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2623b4285af6SJunchao Zhang  {
2624b4285af6SJunchao Zhang   /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2625b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2626b4285af6SJunchao Zhang   */
2627b4285af6SJunchao Zhang   void*  dBuffer1 = NULL;
2628b4285af6SJunchao Zhang   void*  dBuffer2 = NULL;
2629b4285af6SJunchao Zhang   void*  dBuffer3 = NULL;
2630b4285af6SJunchao Zhang   /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2631b4285af6SJunchao Zhang   size_t bufferSize1 = 0;
2632b4285af6SJunchao Zhang   size_t bufferSize2 = 0;
2633b4285af6SJunchao Zhang   size_t bufferSize3 = 0;
2634b4285af6SJunchao Zhang   size_t bufferSize4 = 0;
2635b4285af6SJunchao Zhang   size_t bufferSize5 = 0;
2636b4285af6SJunchao Zhang 
2637b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2638b4285af6SJunchao Zhang   /* ask bufferSize1 bytes for external memory */
2639b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2640b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
26419566063dSJacob Faibussowitsch                                             &bufferSize1, NULL);PetscCallCUSPARSE(stat);
26429566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
2643b4285af6SJunchao Zhang   /* inspect the matrices A and B to understand the memory requirement for the next step */
2644b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2645b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
26469566063dSJacob Faibussowitsch                                             &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat);
2647b4285af6SJunchao Zhang 
2648b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2649b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2650b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
26519566063dSJacob Faibussowitsch                                  &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat);
26529566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
26539566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
26549566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
2655b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2656b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
26579566063dSJacob Faibussowitsch                                  &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat);
26589566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(dBuffer1));
26599566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(dBuffer2));
2660b4285af6SJunchao Zhang 
2661b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2662b4285af6SJunchao Zhang   /* get matrix C non-zero entries C_nnz1 */
26639566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2664b4285af6SJunchao Zhang   c->nz = (PetscInt) C_nnz1;
2665b4285af6SJunchao Zhang   /* allocate matrix C */
26669566063dSJacob Faibussowitsch   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
26679566063dSJacob Faibussowitsch   Ccsr->values         = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2668b4285af6SJunchao Zhang   /* update matC with the new pointers */
2669b4285af6SJunchao Zhang   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
26709566063dSJacob Faibussowitsch                                 Ccsr->values->data().get());PetscCallCUSPARSE(stat);
2671b4285af6SJunchao Zhang 
2672b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2673b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2674b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
26759566063dSJacob Faibussowitsch                                   &bufferSize5, NULL);PetscCallCUSPARSE(stat);
26769566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
2677b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2678b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
26799566063dSJacob Faibussowitsch                                   &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat);
26809566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(dBuffer3));
2681b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2682b4285af6SJunchao Zhang                                      Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2683b4285af6SJunchao Zhang                                      cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
26849566063dSJacob Faibussowitsch                                      mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
26859566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
2686b4285af6SJunchao Zhang  }
2687ae37ee31SJunchao Zhang  #else
2688b4285af6SJunchao Zhang   size_t bufSize2;
2689fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2690b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2691fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2692fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
26939566063dSJacob Faibussowitsch                                        mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat);
26949566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
2695fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2696b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2697fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2698fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
26999566063dSJacob Faibussowitsch                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat);
2700fcdce8c4SStefano Zampini   /* query (NULL buffer) the size in bytes of the external buffer needed by the compute step */
2701b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2702fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2703fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
27049566063dSJacob Faibussowitsch                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat);
2705fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2706fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2707fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2708fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2709fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
27109566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
2711fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2712b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2713fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2714fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
27159566063dSJacob Faibussowitsch                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
2716fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
27179566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2718fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
27199566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
2720fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
27219566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2722fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
27239566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2724fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
27259566063dSJacob Faibussowitsch                                 Ccsr->values->data().get());PetscCallCUSPARSE(stat);
2726b4285af6SJunchao Zhang   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2727fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
27289566063dSJacob Faibussowitsch                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2729ae37ee31SJunchao Zhang  #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2730fcdce8c4SStefano Zampini #else
27319566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
2732b4285af6SJunchao Zhang   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
2733fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2734fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2735fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
27369566063dSJacob Faibussowitsch                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
2737fcdce8c4SStefano Zampini   c->nz = cnz;
2738fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
27399566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2740fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
27419566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2742fcdce8c4SStefano Zampini 
27439566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2744fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2745fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2746fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2747b4285af6SJunchao Zhang   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2748fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2749fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2750fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
27519566063dSJacob Faibussowitsch                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
2752fcdce8c4SStefano Zampini #endif
27539566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
27549566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
2755fcdce8c4SStefano Zampini finalizesym:
2756fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2757fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2758fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
27599566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m+1,&c->i));
27609566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz,&c->j));
2761fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2762fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2763fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2764fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2765fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2766fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2767fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
27689566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
27699566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
2770fcdce8c4SStefano Zampini   } else {
2771fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2772fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
27739566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
27749566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
2775fcdce8c4SStefano Zampini   }
2776fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
2777fcdce8c4SStefano Zampini     PetscInt r = 0;
2778fcdce8c4SStefano Zampini     c->i[0] = 0;
2779fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2780fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2781fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2782fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2783fcdce8c4SStefano Zampini     }
2784fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2785fcdce8c4SStefano Zampini   }
27869566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
27879566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m,&c->ilen));
27889566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m,&c->imax));
2789fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2790fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2791fcdce8c4SStefano Zampini   c->rmax = 0;
2792fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2793fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2794fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2795fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2796fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2797fcdce8c4SStefano Zampini   }
27989566063dSJacob Faibussowitsch   PetscCall(MatMarkDiagonal_SeqAIJ(C));
27999566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz,&c->a));
2800fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2801fcdce8c4SStefano Zampini 
2802fcdce8c4SStefano Zampini   C->nonzerostate++;
28039566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
28049566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
2805fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2806fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2807fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2808fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2809fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2810abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2811fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2812fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2813fcdce8c4SStefano Zampini   }
2814fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2815fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2816fcdce8c4SStefano Zampini }
2817fcdce8c4SStefano Zampini 
/* Prototype only: MatProductSetFromOptions_SeqAIJ_SeqDense() takes a Mat and returns a PetscErrorCode;
   its body is not part of this section of the file.
   NOTE(review): presumably the SeqAIJ-times-SeqDense product setup that
   MatProductSetFromOptions_SeqAIJCUSPARSE() falls back to when B is dense -- confirm at the call site. */
2818fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2819fcdce8c4SStefano Zampini 
/*
   MatProductSetFromOptions_SeqAIJCUSPARSE - picks the symbolic product routine for mat->product.

   Handles both a sparse and a dense B:
     - B is flagged "dense" when its base type is MATSEQDENSE;
     - B (and, for ABC products, C) is flagged "cusparse" only when it is MATSEQAIJCUSPARSE and
       none of the involved matrices is bound to the CPU.
   When all sparse operands are cusparse, the user may still request the CPU code through the
   *_backend_cpu options queried below. Everything not handled here falls back to the SeqAIJ setup.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool   isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
  }
  if (product->type == MATPRODUCT_ABC) {
    /* the third operand only matters for triple products */
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
    }
  }

  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool  usecpu   = PETSC_FALSE;
    const char *title   = NULL; /* options block title */
    const char *optname = NULL; /* option to query; stays NULL for product types without such an option */
    const char *manpage = NULL; /* manual page attached to the option */

    switch (product->type) {
    case MATPRODUCT_AB:
      manpage = "MatMatMult";
      title   = product->api_user ? "MatMatMult" : "MatProduct_AB";
      optname = product->api_user ? "-matmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_AtB:
      manpage = "MatTransposeMatMult";
      title   = product->api_user ? "MatTransposeMatMult" : "MatProduct_AtB";
      optname = product->api_user ? "-mattransposematmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_PtAP:
      manpage = "MatPtAP";
      title   = product->api_user ? "MatPtAP" : "MatProduct_PtAP";
      optname = product->api_user ? "-matptap_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_RARt:
      manpage = "MatRARt";
      title   = product->api_user ? "MatRARt" : "MatProduct_RARt";
      optname = product->api_user ? "-matrart_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_ABC:
      manpage = "MatMatMatMult";
      title   = product->api_user ? "MatMatMatMult" : "MatProduct_ABC";
      optname = product->api_user ? "-matmatmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    default:
      break;
    }
    if (optname) {
      PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,title,"Mat");
      PetscCall(PetscOptionsBool(optname,"Use CPU code",manpage,usecpu,&usecpu,NULL));
      PetscOptionsEnd();
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }

  /* dispatch */
  if (isdense) {
    const MatProductType ptype = product->type;

    if (ptype == MATPRODUCT_AB || ptype == MATPRODUCT_AtB || ptype == MATPRODUCT_ABt || ptype == MATPRODUCT_PtAP || ptype == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      else mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
    } else if (ptype == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
  } else if (Biscusp && Ciscusp) {
    const MatProductType ptype = product->type;

    if (ptype == MATPRODUCT_AB || ptype == MATPRODUCT_AtB || ptype == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (ptype == MATPRODUCT_PtAP || ptype == MATPRODUCT_RARt || ptype == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}
2941ccdfe979SStefano Zampini 
29426fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
29439ae82921SPaul Mullowney {
29449ae82921SPaul Mullowney   PetscFunctionBegin;
29459566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
2946e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2947e6e9a74fSStefano Zampini }
2948e6e9a74fSStefano Zampini 
/* zz = A xx + yy: forwards to the shared kernel with op(A) = A */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  const PetscBool transpose = PETSC_FALSE,hermitian = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,transpose,hermitian));
  PetscFunctionReturn(0);
}
2955e6e9a74fSStefano Zampini 
/* yy = A^H xx: forwards to the shared kernel with no vector to add and op(A) = A^H */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool transpose = PETSC_TRUE,hermitian = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,transpose,hermitian));
  PetscFunctionReturn(0);
}
2962e6e9a74fSStefano Zampini 
/* zz = A^H xx + yy: forwards to the shared kernel with op(A) = A^H */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool transpose = PETSC_TRUE,hermitian = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,transpose,hermitian));
  PetscFunctionReturn(0);
}
29699ae82921SPaul Mullowney 
29706fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2971ca45077fSPaul Mullowney {
2972ca45077fSPaul Mullowney   PetscFunctionBegin;
29739566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
2974ca45077fSPaul Mullowney   PetscFunctionReturn(0);
2975ca45077fSPaul Mullowney }
2976ca45077fSPaul Mullowney 
/*
   ScatterAdd - y[idx[i]] += x[i] for 0 <= i < n, one entry per thread.

   Expects a 1D launch whose total thread count is at least n; surplus threads do nothing.
   No atomics are used, so the entries of idx must be pairwise distinct for the result to be
   race free (NOTE(review): the visible caller passes compressed-row indices - confirm they are unique).

   The global index is formed in 64-bit arithmetic: PetscInt may be 64-bit, and the 32-bit product
   blockIdx.x*blockDim.x stored in an int could wrap and slip past the i < n guard.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const long long i = (long long)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < (long long)n) y[idx[i]] += x[i];
}
2982a0e72f99SJunchao Zhang 
/*
   MatMultAddKernel_SeqAIJCUSPARSE - z = op(A) x + y on the GPU.

   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.

   Input Parameters:
+  A     - the matrix; its GPU copy is refreshed with MatSeqAIJCUSPARSECopyToGPU() before use
.  xx    - the vector x
.  yy    - the vector y to add, or NULL to compute z = op(A) x only
.  trans - apply the transpose of A
-  herm  - apply the conjugate transpose; only valid together with trans

   Output Parameter:
.  zz    - the result z; may be the same vector as yy

   Notes:
   When A has no nonzeros the result is just zz = 0 (yy NULL) or zz = yy.

   For a plain transpose with A->form_explicit_transpose set, the explicitly stored transpose is
   multiplied with the non-transposed operation; otherwise cuSPARSE applies op(A) to the stored matrix.

   The cuSPARSE SpMV computes dptr = alpha op(A) xptr + beta dptr, i.e. beta scales whatever the
   output buffer already holds. beta may therefore be one only when that buffer really contains yy
   (yy == zz, obtained read-write); in every other case beta is zero and yy is added afterwards.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nz) {
    /* empty matrix: op(A) x vanishes */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
    else PetscCall(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose to the stored matrix */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* multiply with the explicitly stored transpose; opA stays NON_TRANSPOSE */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta z
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         A^T x is of full length and is written straight into z. beta scales the current content of z,
         so it can be 1.0 only when z already holds y (yy == zz). When yy is a different vector, z was
         obtained write-only and holds stale data: use beta = 0 here and add yy after the product.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = (yy == zz) ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA is used as an index into the per-operation cuSpMV[] cache below */
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                               matstruct->matDescr,
                                               matstruct->cuSpMV[opA].vecXDescr, beta,
                                               matstruct->cuSpMV[opA].vecYDescr,
                                               cusparse_scalartype,
                                               cusparsestruct->spmvAlg,
                                               &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                  matstruct->alpha_one,
                                  matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                  matstruct->cuSpMV[opA].vecXDescr,
                                  beta,
                                  matstruct->cuSpMV[opA].vecYDescr,
                                  cusparse_scalartype,
                                  cusparsestruct->spmvAlg,
                                  matstruct->cuSpMV[opA].spmvBuffer));
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                       mat->num_rows, mat->num_cols,
                                       mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                       mat->values->data().get(), mat->row_offsets->data().get(),
                                       mat->column_indices->data().get(), xptr, beta,
                                       dptr));
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                         matstruct->alpha_one, matstruct->descr, hybMat,
                                         xptr, beta,
                                         dptr));
       #endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
        PetscCallCUDA(cudaGetLastError()); /* a kernel launch returns no status; pick up launch errors here */
       #endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      /* the product was computed with beta = 0 unless yy == zz, so a distinct yy is added now */
      if (yy && yy != zz) {
        PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0*a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}
31719ae82921SPaul Mullowney 
31726fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
3173ca45077fSPaul Mullowney {
3174ca45077fSPaul Mullowney   PetscFunctionBegin;
31759566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
3176ca45077fSPaul Mullowney   PetscFunctionReturn(0);
3177ca45077fSPaul Mullowney }
3178ca45077fSPaul Mullowney 
31796fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
31809ae82921SPaul Mullowney {
3181042217e8SBarry Smith   PetscObjectState   onnz = A->nonzerostate;
3182042217e8SBarry Smith   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
31833fa6b06aSMark Adams 
3184042217e8SBarry Smith   PetscFunctionBegin;
31859566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
3186042217e8SBarry Smith   if (onnz != A->nonzerostate && cusp->deviceMat) {
3187042217e8SBarry Smith 
31889566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
31899566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusp->deviceMat));
3190042217e8SBarry Smith     cusp->deviceMat = NULL;
3191042217e8SBarry Smith   }
31929ae82921SPaul Mullowney   PetscFunctionReturn(0);
31939ae82921SPaul Mullowney }
31949ae82921SPaul Mullowney 
31959ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscInt *rowlens = (PetscInt*)nnz; /* the preallocation routine is called with a non-const pointer */

  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  /* sequential matrix: the local sizes are passed as the global sizes too */
  PetscCall(MatSetSizes(*A,m,n,m,n));
  PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,rowlens));
  PetscFunctionReturn(0);
}
32529ae82921SPaul Mullowney 
32536fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
32549ae82921SPaul Mullowney {
32559ae82921SPaul Mullowney   PetscFunctionBegin;
32569ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
32579566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
32589ae82921SPaul Mullowney   } else {
32599566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
3260aa372e3fSPaul Mullowney   }
32619566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
32629566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
32639566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL));
32649566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
32659566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
32669566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
32679566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
32689566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
32699566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
32709566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL));
32719566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
32729ae82921SPaul Mullowney   PetscFunctionReturn(0);
32739ae82921SPaul Mullowney }
32749ae82921SPaul Mullowney 
3275ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
327695639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/*
  MatDuplicate_SeqAIJCUSPARSE - Duplicates A through MatDuplicate_SeqAIJ() and then converts
  the result in place to MATSEQAIJCUSPARSE.

  Input Parameters:
+ A        - the matrix to duplicate
- cpvalues - duplication option, forwarded unchanged to MatDuplicate_SeqAIJ()

  Output Parameter:
. B - the duplicate
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  /* step 1: duplicate with the SeqAIJ routine */
  PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
  /* step 2: in-place conversion of the duplicate to the CUSPARSE type */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}
32849ff858a8SKarl Rupp 
/*
  MatAXPY_SeqAIJCUSPARSE - Computes Y = a*X + Y.

  Input Parameters:
+ Y   - the matrix that is updated
. a   - the scalar multiplying X
. X   - the matrix that is added
- str - relation between the nonzero patterns of X and Y

  Notes:
  If X and Y do not share the same axpy implementation the work is delegated to MatAXPY_SeqAIJ().
  Otherwise SUBSET_NONZERO_PATTERN is handled with the cusparse spgeam routine,
  SAME_NONZERO_PATTERN with a cublas axpy on the value arrays, and every other case
  falls back to MatAXPY_SeqAIJ().
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *xaij = (Mat_SeqAIJ*)X->data,*yaij = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cuspY = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  Mat_SeqAIJCUSPARSE *cuspX = (Mat_SeqAIJCUSPARSE*)X->spptr;
  PetscScalar        *valY;
  const PetscScalar  *valX;
  CsrMatrix          *csrY,*csrX;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cuspY->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cuspX->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csrY = (CsrMatrix*)cuspY->mat->mat;
  csrX = (CsrMatrix*)cuspX->mat->mat;

  /* see if we can turn this into a cublas axpy: compare row offsets first and column indices only if those agree */
  if (str != SAME_NONZERO_PATTERN && xaij->nz == yaij->nz && !xaij->compressedrow.use && !yaij->compressedrow.use) {
    const bool samePattern = thrust::equal(thrust::device,csrY->row_offsets->begin(),csrY->row_offsets->end(),csrX->row_offsets->begin())
                          && thrust::equal(thrust::device,csrY->column_indices->begin(),csrY->column_indices->end(),csrX->column_indices->begin());
    if (samePattern) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar beta = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t      workSize;
    void        *work;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&valX));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&valY));
    /* a and beta are host variables, so the handle is switched to host pointer mode for the call */
    PetscCallCUSPARSE(cusparseSetPointerMode(cuspY->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* this spgeam variant needs a work buffer: query its size and allocate it */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cuspY->handle,Y->rmap->n,Y->cmap->n,
                                                  &a,cuspX->mat->descr,xaij->nz,valX,csrX->row_offsets->data().get(),csrX->column_indices->data().get(),
                                                  &beta,cuspY->mat->descr,yaij->nz,valY,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),
                                                  cuspY->mat->descr,      valY,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),&workSize));
    PetscCallCUDA(cudaMalloc(&work,workSize));
#endif
    PetscCall(PetscLogGpuTimeBegin());
    /* the result is written into Y's own CSR arrays */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCallCUSPARSE(cusparse_csr_spgeam(cuspY->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cuspX->mat->descr,xaij->nz,valX,csrX->row_offsets->data().get(),csrX->column_indices->data().get(),
                                       &beta,cuspY->mat->descr,yaij->nz,valY,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),
                                       cuspY->mat->descr,      valY,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),work));
#else
    PetscCallCUSPARSE(cusparse_csr_spgeam(cuspY->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cuspX->mat->descr,xaij->nz,valX,csrX->row_offsets->data().get(),csrX->column_indices->data().get(),
                                       &beta,cuspY->mat->descr,yaij->nz,valY,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),
                                       cuspY->mat->descr,      valY,csrY->row_offsets->data().get(),csrY->column_indices->data().get()));
#endif
    PetscCall(PetscLogGpuFlops(xaij->nz + yaij->nz));
    PetscCall(PetscLogGpuTimeEnd());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCallCUDA(cudaFree(work));
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cuspY->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&valX));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&valY));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the update is a plain axpy on the two value arrays */
    cublasHandle_t blasHandle;
    PetscBLASInt   inc = 1, nvals = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&valX));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&valY));
    PetscCall(PetscCUBLASGetHandle(&blasHandle));
    PetscCall(PetscBLASIntCast(xaij->nz,&nvals));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(blasHandle,nvals,&a,valX,inc,valY,inc));
    PetscCall(PetscLogGpuFlops(2.0*nvals));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&valX));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&valY));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* any other pattern relation: use the SeqAIJ implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}
337895639643SRichard Tran Mills 
/*
  MatScale_SeqAIJCUSPARSE - Scales the stored values of Y by a with a single cublas scal call.

  Input Parameters:
+ Y - the matrix to scale
- a - the scaling factor
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *yaij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  cublasHandle_t blasHandle;
  PetscBLASInt   inc = 1, nvals = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&vals));
  PetscCall(PetscCUBLASGetHandle(&blasHandle));
  /* the value count has to fit into the BLAS integer type */
  PetscCall(PetscBLASIntCast(yaij->nz,&nvals));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(blasHandle,nvals,&a,vals,inc));
  PetscCall(PetscLogGpuFlops(nvals));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&vals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}
339833c9ba73SStefano Zampini 
/*
  MatZeroEntries_SeqAIJCUSPARSE - Zeros the values of A on the host and, for unfactored
  matrices, also in the GPU copy and in the cached GPU transpose when those exist.

  Input Parameter:
. A - the matrix

  Notes:
  The offload mask ends up as PETSC_OFFLOAD_BOTH when the GPU values of A were zeroed as
  well, and as PETSC_OFFLOAD_CPU otherwise.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscBool      gpuZeroed = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

      if (csr->values) {
        gpuZeroed = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;

      /* zeroing the transpose alone does not mark the GPU copy of A as up to date */
      if (csrT->values) thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
    }
  }
  /* host values: a->i[number of rows] is the number of stored entries */
  PetscCall(PetscArrayzero(aij->a,aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = gpuZeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
34273fa6b06aSMark Adams 
/*
  MatBindToCPU_SeqAIJCUSPARSE - Switches the operation table of A between the SeqAIJ (host)
  and the SeqAIJCUSPARSE (GPU) implementations.

  Input Parameters:
+ A   - the matrix
- flg - PETSC_TRUE to install the host implementations, PETSC_FALSE to install the GPU ones

  Notes:
  For factored matrices only A->boundtocpu is recorded.
  When binding to the CPU the values are first copied back from the GPU, the SeqAIJ-specific
  operation table is zeroed and the GPU-specific composed functions are removed.
  Inodes are used only when binding to the CPU and a->inode.size is set.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (!flg) {
    /* GPU implementations */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    aij->ops->getarray                = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    aij->ops->restorearray            = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    aij->ops->getarrayread            = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    aij->ops->restorearrayread        = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    aij->ops->getarraywrite           = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    aij->ops->restorearraywrite       = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    aij->ops->getcsrandmemtype        = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
  } else {
    /* bring the values back to the host before the host implementations take over */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clears every entry of the SeqAIJ-specific operation table */
    PetscCall(PetscMemzero(aij->ops,sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  }
  A->boundtocpu = flg;
  aij->inode.use = (flg && aij->inode.size) ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(0);
}
3491a587d139SMark 
/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - Turns a SeqAIJ matrix into a MATSEQAIJCUSPARSE matrix.

  Input Parameters:
+ A     - the source matrix
. mtype - requested type (not read by this routine)
- reuse - MAT_INITIAL_MATRIX duplicates A into *newmat, MAT_REUSE_MATRIX copies A into the
          existing *newmat, any other value converts *newmat as it is

  Output Parameter:
. newmat - the converted matrix

  Notes:
  The default vector type of the result is set to VECCUDA. Unless reuse is MAT_REUSE_MATRIX,
  a result without spptr gets a fresh GPU structure with its own cusparse handle.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat              B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
    break;
  case MAT_REUSE_MATRIX:
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
    break;
  default: /* nothing to prepare; *newmat is used as it is */
    break;
  }
  B = *newmat;

  /* replace the default vector type with the CUDA one */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype != MAT_FACTOR_NONE) {
      /* factored matrix: spptr holds the triangular-factor structure */
      Mat_SeqAIJCUSPARSETriFactors *factors;

      PetscCall(PetscNew(&factors));
      PetscCallCUSPARSE(cusparseCreate(&factors->handle));
      PetscCallCUSPARSE(cusparseSetStream(factors->handle,PetscDefaultCudaStream));
      B->spptr = factors;
    } else {
      /* unfactored matrix: spptr holds the mult structure, with CSR as the default format */
      Mat_SeqAIJCUSPARSE *cusp;

      PetscCall(PetscNew(&cusp));
      PetscCallCUSPARSE(cusparseCreate(&cusp->handle));
      PetscCallCUSPARSE(cusparseSetStream(cusp->handle,PetscDefaultCudaStream));
      cusp->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if CUSPARSE_VERSION > 11301
      cusp->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      cusp->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = cusp;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* operations that do not depend on the CPU/GPU binding */
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the GPU implementations of the remaining operations */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
35519ae82921SPaul Mullowney 
/*
  MatCreate_SeqAIJCUSPARSE - Constructor for MATSEQAIJCUSPARSE: sets B up as a SeqAIJ matrix
  and then converts it in place.

  Input Parameter:
. B - the matrix to set up
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  /* host-side SeqAIJ setup first */
  PetscCall(MatCreate_SeqAIJ(B));
  /* then the in-place conversion, which adds the CUSPARSE-specific parts */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}
355902fe1965SBarry Smith 
/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later and are not supported with CUDA 11.0 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU

  Level: beginner

.seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/
35777f756511SDominic Meiser 
3578bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
35790f39cd5aSBarry Smith 
/*
  MatSolverTypeRegister_CUSPARSE - Registers the factorization routines of this file:
  the band LU solver for MATSEQAIJ, and LU, Cholesky, ILU and ICC for MATSEQAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  /* factor types served by MatGetFactor_seqaijcusparse_cusparse(), in registration order */
  const MatFactorType factorTypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  const size_t        nTypes = sizeof(factorTypes)/sizeof(factorTypes[0]);
  size_t              k;

  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  for (k=0; k<nTypes; k++) {
    PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factorTypes[k],MatGetFactor_seqaijcusparse_cusparse));
  }
  PetscFunctionReturn(0);
}
359129b38603SBarry Smith 
/*
  MatResetPreallocationCOO_SeqAIJCUSPARSE - Discards the COO preallocation data kept in mat->spptr.

  Input Parameter:
. mat - the matrix; nothing is done when mat->spptr is NULL

  Notes:
  The device arrays jmap_d and perm_d are released only when use_extended_coo is set.
  Their pointers are reset to NULL afterwards because MatSeqAIJCUSPARSE_Destroy() frees
  every non-NULL jmap_d/perm_d; leaving the stale addresses behind would make it free
  the same device memory a second time.
*/
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    /* do not leave dangling device pointers behind */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3609cbc6b225SStefano Zampini 
/*
  MatSeqAIJCUSPARSE_Destroy - Frees a Mat_SeqAIJCUSPARSE structure together with everything
  it holds: the mult structures of the matrix and of its transpose, the work arrays, the COO
  data, the cusparse handle and the structure itself.

  Input Parameter:
. cusparsestruct - address of the pointer to the structure; nothing is done when the pointer is NULL
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
  /* host-side objects created with new */
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  delete cusp->csr2csc_i;
  /* library handle and raw device allocations, released only when set */
  if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
  if (cusp->jmap_d) PetscCallCUDA(cudaFree(cusp->jmap_d));
  if (cusp->perm_d) PetscCallCUDA(cudaFree(cusp->perm_d));
  PetscCall(PetscFree(*cusparsestruct));
  PetscFunctionReturn(0);
}
36287f756511SDominic Meiser 
/*
  CsrMatrix_Destroy - Deletes a CsrMatrix and its three arrays (values, column indices,
  row offsets) and resets the caller's pointer.

  Input Parameter:
. mat - address of the pointer to the matrix; nothing is done when the pointer is null
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(0);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = 0;
  PetscFunctionReturn(0);
}
36417f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - Overload for triangular-factor structures: releases
  the cusparse descriptor and analysis info, the CSR matrix, the solve and csr2csc buffers,
  the AA_h host buffer and finally the structure itself.

  Input Parameter:
. trifactor - address of the pointer to the structure; nothing is done when the pointer is NULL
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(0);
  if (tf->descr)     PetscCallCUSPARSE(cusparseDestroyMatDescr(tf->descr));
  if (tf->solveInfo) PetscCallCUSPARSE(cusparse_destroy_analysis_info(tf->solveInfo));
  PetscCall(CsrMatrix_Destroy(&tf->csrMat));
  if (tf->solveBuffer) PetscCallCUDA(cudaFree(tf->solveBuffer));
  /* AA_h is released with cudaFreeHost(), unlike the device buffers around it */
  if (tf->AA_h)        PetscCallCUDA(cudaFreeHost(tf->AA_h));
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (tf->csr2cscBuffer) PetscCallCUDA(cudaFree(tf->csr2cscBuffer));
 #endif
  PetscCall(PetscFree(*trifactor));
  PetscFunctionReturn(0);
}
36587f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - Overload for mult structures: releases the stored
  matrix (HYB/ELL or CSR), the cusparse descriptors, the compressed-row indices, the device
  scalars alpha_one/beta_zero/beta_one and, with CUDA 11 or later, the SpMV buffers and
  dense-vector descriptors; then deletes the structure and resets the caller's pointer.

  Input Parameters:
+ matstruct - address of the pointer to the structure; nothing is done when the pointer is NULL
- format    - storage format of (*matstruct)->mat, which selects how it is destroyed

  Notes:
  With CUDA 11 or later an ELL or HYB format raises PETSC_ERR_SUP.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* propagate the error code like every other call in this routine */
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one)  PetscCallCUDA(cudaFree((*matstruct)->beta_one));

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* one SpMV slot per entry of cuSpMV[]; only initialized slots own resources */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
37007f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - releases the data held by a triangular-factor container
  while keeping the container itself allocated

  Input Parameter:
. trifactors - address of the pointer to the container; nothing is done if *trifactors is NULL

  Notes:
  Every released member is reset to NULL so that the routine can safely be called again on the
  same container (MatSeqAIJCUSPARSETriFactors_Destroy() calls it before freeing the container).
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose));
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d)   PetscCallCUDA(cudaFree((*trifactors)->a_band_d));
    if ((*trifactors)->i_band_d)   PetscCallCUDA(cudaFree((*trifactors)->i_band_d));
    /* clear the freed device pointers: leaving them dangling would make a later Reset free them twice */
    (*trifactors)->a_band_d = NULL;
    (*trifactors)->i_band_d = NULL;
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3721ccdfe979SStefano Zampini 
/*
  MatSeqAIJCUSPARSETriFactors_Destroy - resets a triangular-factor container, destroys its
  cuSPARSE handle (if any) and frees the container with PetscFree()

  Input Parameter:
. trifactors - address of the pointer to the container; nothing is done if *trifactors is NULL
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(0);

  /* release the owned factor data first, then the handle, then the container */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
  {
    cusparseHandle_t handle = (*trifactors)->handle;

    if (handle) {
      PetscCallCUSPARSE(cusparseDestroy(handle));
    }
  }
  PetscCall(PetscFree(*trifactors));
  PetscFunctionReturn(0);
}
37367e8381f9SStefano Zampini 
/* Strict weak ordering on (i,j) pairs: compare the first component, break ties with the second.
   Uses the free function thrust::get<N>() rather than the tuple member get<N>(), which is not
   provided by every Thrust release. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) const
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};
37477e8381f9SStefano Zampini 
/* Equality on (i,j) pairs: true only when both components match.
   Uses the free function thrust::get<N>() rather than the tuple member get<N>(), which is not
   provided by every Thrust release. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) const
  {
    return thrust::get<0>(t1) == thrust::get<0>(t2) && thrust::get<1>(t1) == thrust::get<1>(t2);
  }
};
37577e8381f9SStefano Zampini 
/* Binary functor returning 1 when its two arguments differ and 0 when they are equal */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    if (t1 != t2) return 1;
    return 0;
  }
};
37667e8381f9SStefano Zampini 
/* Binary functor returning the logical OR of its arguments as 0 or 1 (despite the name, not an arithmetic sum) */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    const bool either = (t1 || t2);
    return either;
  }
};
37757e8381f9SStefano Zampini 
37767e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/*
  MatSetValuesCOO_SeqAIJCUSPARSE_Basic - sets the device values of A from a COO value array, using the
  permutation (cusp->cooPerm) and, when the COO pattern had repeated entries, the key array
  (cusp->cooPerm_a) stored in the matrix

  Input Parameters:
+ A     - the matrix
. v     - COO values, in host or device memory (tested with isCudaMem()); may be NULL
- imode - ADD_VALUES adds to the current values, anything else overwrites them

  Notes:
  If no COO permutation is stored the routine only assembles A.
  If v is NULL the values are zeroed for INSERT_VALUES and left untouched otherwise.
  On return the matrix is marked assembled with up-to-date data on the GPU.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* stage the host values in a temporary device array; it is released at finalize */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them up */
      /* automatic storage: the scratch array is released even if a thrust call throws */
      THRUSTARRAY cooPerm_w(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w.begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w.begin(),cooPerm_w.end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  /* NOTE(review): the values change here but A->transupdated is left as is -- confirm that a cached
     transpose cannot become stale on this path */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
  PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
38587e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - marks the transpose data of a MATSEQAIJCUSPARSE matrix as out of date

  Input Parameters:
+ A       - the matrix
- destroy - if PETSC_TRUE, also destroy the stored transpose struct and the csr2csc_i array

  Notes:
  Nothing is done (beyond the type check) when A has no CUSPARSE data attached.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (spptr) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&spptr->matTranspose,spptr->format));
      delete spptr->csr2csc_i;
      spptr->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3874a49f1ed0SStefano Zampini 
38757e8381f9SStefano Zampini #include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic - builds the CSR pattern of A from n COO (row,column) pairs

  Input Parameters:
+ A     - the matrix
. n     - number of COO entries
. coo_i - row indices of the entries
- coo_j - column indices of the entries

  Notes:
  The pairs are sorted and made unique on the device. cusp->cooPerm stores the sorting permutation;
  cusp->cooPerm_a stores, for each sorted entry, the index of the unique nonzero it contributes to,
  and is NULL when there are no repeated entries. The host CSR arrays of A are rebuilt with zero
  values, copied to the GPU, and any stored transpose is destroyed.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  /* the stored permutation arrays can only be reused when the number of COO entries is unchanged */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
      d_j     = [2,2,3,5,6,x]
                           ^nekey points here (new logical end); ekey is still one past x
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      thrust::adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      thrust::adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), thrust::get<0>(nekey.get_iterator_tuple()), /* search the row numbers [0,1,2,3,4,5] in the ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* rebuild the host CSR arrays from the device results */
    PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz,&a->a));
    PetscCall(PetscMalloc1(a->nz,&a->j));
    PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
    /* per-row lengths, number of nonempty rows (nzr) and maximum row length (rmax) */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
  }
  PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a,a->nz));
  PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}
3988ed502f03SStefano Zampini 
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - COO preallocation entry point for MATSEQAIJCUSPARSE

  Input Parameters:
+ mat   - the matrix
. coo_n - number of COO entries
. coo_i - row indices
- coo_j - column indices

  Notes:
  The indices are scanned for negative values only when coo_i is non-NULL and PetscGetMemType()
  reports host memory. If no negative index is found (or no scan is done) the device-based
  'Basic' routine is used. Otherwise the host SeqAIJ COO preallocation is used and its jmap/perm
  arrays are copied to the device for the extended MatSetValuesCOO path.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscBool    has_negative = PETSC_FALSE;
  PetscMemType mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      PetscCount k = 0;

      /* stop at the first entry with a negative row or column index */
      while (k < coo_n && !(coo_i[k] < 0 || coo_j[k] < 0)) k++;
      if (k < coo_n) has_negative = PETSC_TRUE;
    }
  }

  if (!has_negative) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
    PetscFunctionReturn(0);
  }

  /* extended path: preallocate on the host, then mirror the COO maps on the device */
  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
  {
    Mat_SeqAIJ         *aij  = static_cast<Mat_SeqAIJ*>(mat->data);
    Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);

    PetscCallCUDA(cudaMalloc((void**)&cusp->jmap_d,(aij->nz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(cusp->jmap_d,aij->jmap,(aij->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void**)&cusp->perm_d,aij->Atot*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(cusp->perm_d,aij->perm,aij->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    cusp->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4024219fbbafSJunchao Zhang 
/*
  MatAddCOOValues - kernel that accumulates COO values into the nonzero array a[]

  For every nonzero i in [0,nnz) it sums kv[perm[k]] for k in [jmap[i],jmap[i+1]) and stores the
  sum into a[i], either replacing the old value (imode == INSERT_VALUES) or adding to it.

  The loop is grid-stride over a 1D launch, so any grid size covers all nnz entries.
  jmap[] must have nnz+1 entries.
*/
__global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  /* widen before multiplying: blockIdx/blockDim/gridDim are 32-bit unsigned, and a wrapped
     product would revisit entries or, if the stride wrapped to 0, never terminate */
  PetscCount        i = (PetscCount)blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = (PetscCount)gridDim.x*blockDim.x;
  for (; i<nnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
  }
}
4035219fbbafSJunchao Zhang 
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets the values of A from a COO value array

  Input Parameters:
+ A     - the matrix
. v     - the COO values, in host or device memory
- imode - INSERT_VALUES overwrites the stored values, otherwise the new values are added

  Notes:
  When the matrix was not preallocated through the extended COO path the work is delegated to
  MatSetValuesCOO_SeqAIJCUSPARSE_Basic(). Otherwise host values are first copied to a temporary
  device buffer and the MatAddCOOValues kernel performs the accumulation.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE  *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  const PetscCount    nnz   = aij->nz;
  const PetscScalar   *vdev = v;
  PetscScalar         *vals;
  PetscMemType        vtype;

  PetscFunctionBegin;
  if (!cusp->use_extended_coo) {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
    PetscFunctionReturn(0);
  }

  PetscCall(PetscGetMemType(v,&vtype));
  if (PetscMemTypeHost(vtype)) { /* the kernel needs the values on the device: stage a copy */
    PetscCallCUDA(cudaMalloc((void**)&vdev,aij->coo_n*sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void*)vdev,v,aij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
  }

  /* write-only access suffices when the old values are overwritten */
  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&vals));
  } else {
    PetscCall(MatSeqAIJCUSPARSEGetArray(A,&vals));
  }

  if (nnz) {
    MatAddCOOValues<<<(nnz+255)/256,256>>>(vdev,nnz,cusp->jmap_d,cusp->perm_d,imode,vals);
    PetscCallCUDA(cudaPeekAtLastError());
  }

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&vals));
  } else {
    PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&vals));
  }

  if (PetscMemTypeHost(vtype)) PetscCallCUDA(cudaFree((void*)vdev));
  PetscFunctionReturn(0);
}
4070219fbbafSJunchao Zhang 
/*@C
    MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers (device memory); pass NULL if not needed
-   j - the CSR column indices (device memory); pass NULL if not needed

    Level: developer

    Notes:
      When compressed is true, the CSR structure does not contain empty rows

      If both i and j are NULL the routine returns immediately

.seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* nothing to do only when neither output is requested; each one is optional on its own */
  if (!i && !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* build the full row offsets on the device once and keep them for later calls */
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
41185f101d05SStefano Zampini 
/*@C
    MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE, as passed to MatSeqAIJCUSPARSEGetIJ(); not used by this routine

    Output Parameters:
+   i - the CSR row pointers; set to NULL on return (ignored if NULL is passed)
-   j - the CSR column indices; set to NULL on return (ignored if NULL is passed)

    Level: developer

.seealso: `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the caller's pointers are cleared here */
  if (j) {
    *j = NULL;
  }
  if (i) {
    *i = NULL;
  }
  PetscFunctionReturn(0);
}
41455f101d05SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

   Errors with PETSC_ERR_SUP when the storage format is MAT_CUSPARSE_ELL or MAT_CUSPARSE_HYB

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A is dereferenced only after the argument checks above have accepted it */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard the CSR container itself as well as its value array */
  PetscCheck(csr && csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
4180ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data; the pointed-to value is set to NULL on return

   Level: developer

   Notes: the matrix itself is left untouched, only the caller's pointer is cleared

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: nothing to flag on A, just drop the caller's handle */
  *a = nullptr;
  PetscFunctionReturn(0);
}
4205ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

   Errors with PETSC_ERR_SUP when the storage format is MAT_CUSPARSE_ELL or MAT_CUSPARSE_HYB

   On success the offload mask of A is set to PETSC_OFFLOAD_GPU and MatSeqAIJCUSPARSEInvalidateTranspose() is called on A

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A is dereferenced only after the argument checks above have accepted it */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard the CSR container itself as well as its value array */
  PetscCheck(csr && csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller gets write access to the device values: flag the GPU copy and drop the cached transpose values */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data; the pointed-to value is set to NULL on return

   Level: developer

   Notes: calls MatSeqAIJInvalidateDiagonal() and PetscObjectStateIncrease() on A before clearing the pointer

.seealso: `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values may have been modified through the pointer handed out by the Get routine */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = nullptr;
  PetscFunctionReturn(0);
}
4268039c6fbaSStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

   Errors with PETSC_ERR_SUP when the storage format is MAT_CUSPARSE_ELL or MAT_CUSPARSE_HYB, and with PETSC_ERR_COR
   when the device structures of A are missing, since this routine does not create them

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A is dereferenced only after the argument checks above have accepted it */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* no MatSeqAIJCUSPARSECopyToGPU() here: the current values are about to be overwritten */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard the CSR container itself as well as its value array */
  PetscCheck(csr && csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
4304ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data; the pointed-to value is set to NULL on return

   Level: developer

   Notes: calls MatSeqAIJInvalidateDiagonal() and PetscObjectStateIncrease() on A before clearing the pointer

.seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values were (re)written through the pointer handed out by the Get routine */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = nullptr;
  PetscFunctionReturn(0);
}
4331ed502f03SStefano Zampini 
/*
   Less-than comparison of two 4-tuples on their first two entries only: tuples are ordered by entry 0 and,
   when those are equal, by entry 1. Entries 2 and 3 never take part in the comparison.
   Used as the ordering for thrust::merge() in MatSeqAIJCUSPARSEMergeMats().
*/
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    /* thrust::get<N>(t) rather than t.get<N>(): the member form is not available in every Thrust release */
    const int first1 = thrust::get<0>(t1);
    const int first2 = thrust::get<0>(t2);

    if (first1 != first2) return first1 < first2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
4342ed502f03SStefano Zampini 
/* Unary functor returning its integer argument plus a fixed offset chosen at construction */
struct Shift
{
  int _shift; /* offset added to every argument */

  Shift(int shift) : _shift(shift) {}
  /* const: the call does not modify the functor, so it can be invoked through const objects and references */
  __host__ __device__
  inline int operator() (const int &c) const
  {
    return c + _shift;
  }
};
4354ed502f03SStefano Zampini 
4355ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
4356ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
4357ed502f03SStefano Zampini {
4358ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
4359ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
4360ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4361ed502f03SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
4362ed502f03SStefano Zampini   PetscInt                     Annz,Bnnz;
4363ed502f03SStefano Zampini   cusparseStatus_t             stat;
4364ed502f03SStefano Zampini   PetscInt                     i,m,n,zero = 0;
4365ed502f03SStefano Zampini 
4366ed502f03SStefano Zampini   PetscFunctionBegin;
4367ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4368ed502f03SStefano Zampini   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
4369ed502f03SStefano Zampini   PetscValidPointer(C,4);
4370ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4371ed502f03SStefano Zampini   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
43725f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
437308401ef6SPierre Jolivet   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
4374aed4548fSBarry Smith   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4375aed4548fSBarry Smith   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4376ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4377ed502f03SStefano Zampini     m     = A->rmap->n;
4378ed502f03SStefano Zampini     n     = A->cmap->n + B->cmap->n;
43799566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF,C));
43809566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(*C,m,n,m,n));
43819566063dSJacob Faibussowitsch     PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
4382ed502f03SStefano Zampini     c     = (Mat_SeqAIJ*)(*C)->data;
4383ed502f03SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4384ed502f03SStefano Zampini     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
4385ed502f03SStefano Zampini     Ccsr  = new CsrMatrix;
4386ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4387ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4388ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4389ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4390ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4391ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4392ed502f03SStefano Zampini     Ccusp->nrows    = m;
4393ed502f03SStefano Zampini     Ccusp->mat      = Cmat;
4394ed502f03SStefano Zampini     Ccusp->mat->mat = Ccsr;
4395ed502f03SStefano Zampini     Ccsr->num_rows  = m;
4396ed502f03SStefano Zampini     Ccsr->num_cols  = n;
43979566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
43989566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
43999566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
44009566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
44019566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
44029566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
44039566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
44049566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
44059566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
44069566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
44079566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
440828b400f6SJacob Faibussowitsch     PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
440928b400f6SJacob Faibussowitsch     PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4410ed502f03SStefano Zampini 
4411ed502f03SStefano Zampini     Acsr = (CsrMatrix*)Acusp->mat->mat;
4412ed502f03SStefano Zampini     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4413ed502f03SStefano Zampini     Annz = (PetscInt)Acsr->column_indices->size();
4414ed502f03SStefano Zampini     Bnnz = (PetscInt)Bcsr->column_indices->size();
4415ed502f03SStefano Zampini     c->nz = Annz + Bnnz;
4416ed502f03SStefano Zampini     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4417ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4418ed502f03SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
4419ed502f03SStefano Zampini     Ccsr->num_entries = c->nz;
4420ed502f03SStefano Zampini     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4421ed502f03SStefano Zampini     if (c->nz) {
44222ed87e7eSStefano Zampini       auto Acoo = new THRUSTINTARRAY32(Annz);
44232ed87e7eSStefano Zampini       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
44242ed87e7eSStefano Zampini       auto Ccoo = new THRUSTINTARRAY32(c->nz);
44252ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff,*Broff;
44262ed87e7eSStefano Zampini 
4427ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4428ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4429ed502f03SStefano Zampini           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4430ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
44319566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
4432ed502f03SStefano Zampini         }
44332ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
44342ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4435ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4436ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4437ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
4438ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
44399566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
4440ed502f03SStefano Zampini         }
44412ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
44422ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
44439566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
44442ed87e7eSStefano Zampini       stat = cusparseXcsr2coo(Acusp->handle,
44452ed87e7eSStefano Zampini                               Aroff->data().get(),
44462ed87e7eSStefano Zampini                               Annz,
44472ed87e7eSStefano Zampini                               m,
44482ed87e7eSStefano Zampini                               Acoo->data().get(),
44499566063dSJacob Faibussowitsch                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
4450ed502f03SStefano Zampini       stat = cusparseXcsr2coo(Bcusp->handle,
44512ed87e7eSStefano Zampini                               Broff->data().get(),
4452ed502f03SStefano Zampini                               Bnnz,
4453ed502f03SStefano Zampini                               m,
44542ed87e7eSStefano Zampini                               Bcoo->data().get(),
44559566063dSJacob Faibussowitsch                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
44562ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
44572ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
44582ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
44598909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4460ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4461ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
44628909a122SStefano Zampini #else
44638909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
44648909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
44658909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
44668909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
44678909a122SStefano Zampini #endif
44682ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
44692ed87e7eSStefano Zampini       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
44702ed87e7eSStefano Zampini       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
44712ed87e7eSStefano Zampini       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
44722ed87e7eSStefano Zampini       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
44732ed87e7eSStefano Zampini       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4474ed502f03SStefano Zampini       auto p1 = Ccusp->cooPerm->begin();
4475ed502f03SStefano Zampini       auto p2 = Ccusp->cooPerm->begin();
4476ed502f03SStefano Zampini       thrust::advance(p2,Annz);
44772ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
44788909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
44798909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
44808909a122SStefano Zampini #endif
44812ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
44822ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
44832ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
44842ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
44852ed87e7eSStefano Zampini #else
44862ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
44872ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
44882ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
44892ed87e7eSStefano Zampini #endif
4490ed502f03SStefano Zampini       stat = cusparseXcoo2csr(Ccusp->handle,
44912ed87e7eSStefano Zampini                               Ccoo->data().get(),
4492ed502f03SStefano Zampini                               c->nz,
4493ed502f03SStefano Zampini                               m,
4494ed502f03SStefano Zampini                               Ccsr->row_offsets->data().get(),
44959566063dSJacob Faibussowitsch                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
44969566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
44972ed87e7eSStefano Zampini       delete wPerm;
44982ed87e7eSStefano Zampini       delete Acoo;
44992ed87e7eSStefano Zampini       delete Bcoo;
45002ed87e7eSStefano Zampini       delete Ccoo;
4501ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4502ed502f03SStefano Zampini       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4503ed502f03SStefano Zampini                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4504ed502f03SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
45059566063dSJacob Faibussowitsch                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
4506ed502f03SStefano Zampini #endif
45071a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
45089566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
45099566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4510ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4511ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4512ed502f03SStefano Zampini         CsrMatrix *CcsrT = new CsrMatrix;
4513ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4514ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4515ed502f03SStefano Zampini 
45161a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
45171a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4518a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu = NULL;
4519ed502f03SStefano Zampini         CmatT->cprowIndices = NULL;
4520ed502f03SStefano Zampini         CmatT->mat = CcsrT;
4521ed502f03SStefano Zampini         CcsrT->num_rows = n;
4522ed502f03SStefano Zampini         CcsrT->num_cols = m;
4523ed502f03SStefano Zampini         CcsrT->num_entries = c->nz;
4524ed502f03SStefano Zampini 
4525ed502f03SStefano Zampini         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4526ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4527ed502f03SStefano Zampini         CcsrT->values = new THRUSTARRAY(c->nz);
4528ed502f03SStefano Zampini 
45299566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
4530ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4531ed502f03SStefano Zampini         if (AT) {
4532ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4533ed502f03SStefano Zampini           thrust::advance(rT,-1);
4534ed502f03SStefano Zampini         }
4535ed502f03SStefano Zampini         if (BT) {
4536ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4537ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4538ed502f03SStefano Zampini           thrust::copy(titb,tite,rT);
4539ed502f03SStefano Zampini         }
4540ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4541ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4542ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4543ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4544ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4545ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
45469566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
4547ed502f03SStefano Zampini 
45489566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
45499566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
45509566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
45519566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
45529566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
45539566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
45549566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
45559566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
45569566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
4557ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4558ed502f03SStefano Zampini         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4559ed502f03SStefano Zampini                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4560ed502f03SStefano Zampini                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
45619566063dSJacob Faibussowitsch                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
4562ed502f03SStefano Zampini #endif
4563ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4564ed502f03SStefano Zampini       }
4565ed502f03SStefano Zampini     }
4566ed502f03SStefano Zampini 
4567ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4568ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4569ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
45709566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m+1,&c->i));
45719566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz,&c->j));
4572ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4573ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4574ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4575ed502f03SStefano Zampini       ii   = *Ccsr->row_offsets;
4576ed502f03SStefano Zampini       jj   = *Ccsr->column_indices;
45779566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
45789566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
4579ed502f03SStefano Zampini     } else {
45809566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
45819566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
4582ed502f03SStefano Zampini     }
45839566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
45849566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m,&c->ilen));
45859566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m,&c->imax));
4586ed502f03SStefano Zampini     c->maxnz = c->nz;
4587ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4588ed502f03SStefano Zampini     c->rmax = 0;
4589ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4590ed502f03SStefano Zampini       const PetscInt nn = c->i[i+1] - c->i[i];
4591ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4592ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4593ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax,nn);
4594ed502f03SStefano Zampini     }
45959566063dSJacob Faibussowitsch     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
45969566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz,&c->a));
4597ed502f03SStefano Zampini     (*C)->nonzerostate++;
45989566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->rmap));
45999566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->cmap));
4600ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4601ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4602ed502f03SStefano Zampini   } else {
460308401ef6SPierre Jolivet     PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
4604ed502f03SStefano Zampini     c = (Mat_SeqAIJ*)(*C)->data;
4605ed502f03SStefano Zampini     if (c->nz) {
4606ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
46075f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4608aed4548fSBarry Smith       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
460908401ef6SPierre Jolivet       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
46109566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
46119566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
46125f80ce2aSJacob Faibussowitsch       PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
46135f80ce2aSJacob Faibussowitsch       PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4614ed502f03SStefano Zampini       Acsr = (CsrMatrix*)Acusp->mat->mat;
4615ed502f03SStefano Zampini       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4616ed502f03SStefano Zampini       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4617aed4548fSBarry Smith       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
4618aed4548fSBarry Smith       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4619aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4620aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
46215f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4622ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4623ed502f03SStefano Zampini       thrust::advance(pmid,Acsr->num_entries);
46249566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
4625ed502f03SStefano Zampini       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4626ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4627ed502f03SStefano Zampini       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4628ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4629ed502f03SStefano Zampini       thrust::for_each(zibait,zieait,VecCUDAEquals());
4630ed502f03SStefano Zampini       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4631ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4632ed502f03SStefano Zampini       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4633ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4634ed502f03SStefano Zampini       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
46359566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
46361a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
46375f80ce2aSJacob Faibussowitsch         PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4638ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4639ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4640ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4641ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4642ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4643ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4644ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
46451a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4646ed502f03SStefano Zampini       }
46479566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
4648ed502f03SStefano Zampini     }
4649ed502f03SStefano Zampini   }
46509566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4651ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4652ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4653ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4654ed502f03SStefano Zampini   PetscFunctionReturn(0);
4655ed502f03SStefano Zampini }
4656c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies entries of the value array of A into v

  Input Parameters:
+ A   - the matrix; its value array is obtained with MatSeqAIJCUSPARSEGetArrayRead()
. n   - number of entries to copy
- idx - positions in the value array to gather (v[k] = values[idx[k]]), or NULL to copy the first n entries

  Output Parameter:
. v - destination for the n values; isCudaMem() decides whether it is treated as device or host memory

  Notes:
  idx is uploaded into a device work array, which is logged as a CPU-to-GPU transfer.

  When v is not device memory the gather is done into a temporary device buffer that is then copied
  to v; that copy (as well as the plain copy taken when idx is NULL) is logged as a GPU-to-CPU transfer.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    /* indices are needed on the device to drive the permutation iterator */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    /* scoped work buffer: it stays empty when v is device memory, and it is released automatically
       even if one of the checked calls below returns early or thrust throws */
    THRUSTARRAY                     w;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w.resize(n);
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* gather: dv[k] = av[widx[k]] for k in [0,n) */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) PetscCallCUDA(cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
  } else {
    /* no index set (or nothing to gather): straight copy of the leading n values */
    PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* the values travelled from the device to the host, so log them in that direction */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n*sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}
4694