xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision d0967f542ebb93126ab9aa87c257c55864287953)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
79ae82921SPaul Mullowney 
83d13b8fdSMatthew G. Knepley #include <petscconf.h>
93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
139ae82921SPaul Mullowney #undef VecType
143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
16*d0967f54SJacob Faibussowitsch #if PETSC_CPP_VERSION >= 14
17*d0967f54SJacob Faibussowitsch #define PETSC_HAVE_THRUST_ASYNC 1
18*d0967f54SJacob Faibussowitsch // thrust::for_each(thrust::cuda::par.on()) requires C++14
19a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
20*d0967f54SJacob Faibussowitsch #endif
21a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
22a2cee5feSJed Brown #include <thrust/remove.h>
23a2cee5feSJed Brown #include <thrust/sort.h>
24a2cee5feSJed Brown #include <thrust/unique.h>
25e8d2b73aSMark Adams 
26e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
27afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
28afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
29afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
30afb2bd1cSJunchao Zhang 
31afb2bd1cSJunchao Zhang   typedef enum {
32afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
33afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
34afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
35afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
36afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
37afb2bd1cSJunchao Zhang 
38afb2bd1cSJunchao Zhang   typedef enum {
39afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
40afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
41afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
42afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
43afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
44afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
45afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
46afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
47afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
48afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
49afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
50afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
51afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
52afb2bd1cSJunchao Zhang 
  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
58afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
59afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
60afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
61afb2bd1cSJunchao Zhang #endif
629ae82921SPaul Mullowney 
63087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
65087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
66087f3262SPaul Mullowney 
676fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
686fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
696fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
70087f3262SPaul Mullowney 
716fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
726fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
736fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
746fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
75dbbe0bcdSBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
76a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
7733c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
786fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
796fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
806fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
816fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
82e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
83e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
84e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
859ae82921SPaul Mullowney 
867f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
87470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
88470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
89470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
90470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);
917f756511SDominic Meiser 
9257181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
93a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
9457181aedSStefano Zampini 
95c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
96e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
97219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
98c215019aSStefano Zampini 
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  /* for a sequential matrix MULT and ALL both set the single stored format */
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL: cusparsestruct->format = format; break;
  default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}
1109ae82921SPaul Mullowney 
111e057df02SPaul Mullowney /*@
11211a5261eSBarry Smith    MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
11311a5261eSBarry Smith    operation. Only the `MatMult()` operation can use different GPU storage formats
11411a5261eSBarry Smith 
115e057df02SPaul Mullowney    Not Collective
116e057df02SPaul Mullowney 
117e057df02SPaul Mullowney    Input Parameters:
11811a5261eSBarry Smith +  A - Matrix of type `MATSEQAIJCUSPARSE`
11911a5261eSBarry Smith .  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,
12011a5261eSBarry Smith         `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
12111a5261eSBarry Smith -  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
122e057df02SPaul Mullowney 
123e057df02SPaul Mullowney    Output Parameter:
124e057df02SPaul Mullowney 
125e057df02SPaul Mullowney    Level: intermediate
126e057df02SPaul Mullowney 
12711a5261eSBarry Smith .seealso: `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
128e057df02SPaul Mullowney @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch through the composed "MatCUSPARSESetFormat_C" method; PetscTryMethod
     makes this a no-op for matrix types that do not compose it */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(0);
}
135e057df02SPaul Mullowney 
/* Type-specific implementation: record the CPU-solve preference on the GPU side-structure. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu) {
  PetscFunctionBegin;
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
143365b711fSMark Adams 
144365b711fSMark Adams /*@
14511a5261eSBarry Smith    MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
146365b711fSMark Adams 
147365b711fSMark Adams    Input Parameters:
14811a5261eSBarry Smith +  A - Matrix of type `MATSEQAIJCUSPARSE`
14911a5261eSBarry Smith -  use_cpu - set flag for using the built-in CPU `MatSolve()`
150365b711fSMark Adams 
151365b711fSMark Adams    Output Parameter:
152365b711fSMark Adams 
15311a5261eSBarry Smith    Note:
154365b711fSMark Adams    The cuSparse LU solver currently computes the factors with the built-in CPU method
155365b711fSMark Adams    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify whether the solve is performed on the CPU or on the GPU (GPU is the default).
157365b711fSMark Adams 
158365b711fSMark Adams    Level: intermediate
159365b711fSMark Adams 
16011a5261eSBarry Smith .seealso: `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
161365b711fSMark Adams @*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch through the composed "MatCUSPARSESetUseCPUSolve_C" method; PetscTryMethod
     makes this a no-op for matrix types that do not compose it */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(0);
}
168365b711fSMark Adams 
/* MatSetOption implementation for SEQAIJCUSPARSE: intercept the explicit-transpose
   option, delegate everything else to the host SeqAIJ implementation. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg) {
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(0);
}
181e6e9a74fSStefano Zampini 
182bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
183bddcd29dSMark Adams 
/* Numeric LU factorization for SEQAIJCUSPARSE: the factorization itself runs on the
   CPU (via MatLUFactorNumeric_SeqAIJ); afterwards the solve operations are pointed at
   the GPU (or CPU) variants and, for the GPU path, the triangular factors are copied over. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJ         *b     = (Mat_SeqAIJ *)B->data;
  IS                  isrow = b->row, iscol = b->col; /* permutations chosen at symbolic factorization */
  PetscBool           row_identity, col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)B->spptr;

  PetscFunctionBegin;
  /* the CPU kernel reads A's host arrays, so bring A up to date on the host first */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* the fresh factors only exist on the host at this point */
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(iscol, &col_identity));

  if (!cusparsestruct->use_cpu_solve) {
    /* identity permutations allow the cheaper natural-ordering solve path */
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  /* no GPU MatMatSolve variants are provided */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
214bddcd29dSMark Adams 
/* Process -mat_cusparse_* command line options for a SEQAIJCUSPARSE matrix:
   storage format, CPU-vs-GPU triangular solve, and (CUDA >= 11) the cuSPARSE
   SpMV/SpMM/csr2csc algorithm choices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) {
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  /* these options only apply to unfactored matrices */
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    /* processed after the SpMV-only option above, so when both are given this one wins */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
2499ae82921SPaul Mullowney 
/* Build (or refresh) the unit lower-triangular factor L on the GPU from the host
   ILU factor stored in Mat_SeqAIJ. On first call the CSR structure is created in
   pinned host memory, copied into thrust device arrays, and the cusparse csrsv
   analysis is performed; on later calls only the numeric values are refreshed. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) {
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0); /* empty matrix: nothing to build */
  /* only rebuild when the host data is newer than (or the only copy of) the device data */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        /* first time: build the full CSR structure for L */
        PetscScalar *AALo;

        /* pinned host buffers for fast host->device transfer */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix: each row gets its strictly-lower entries
           from the host factor followed by an explicit unit diagonal entry */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz      = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* upload the CSR arrays (thrust assign performs the host->device copies) */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* csrsv2 API needs an explicitly sized work buffer for analysis/solve */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned values buffer for cheap numeric-only refreshes; free the index buffers */
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0; /* unit diagonal */
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
  }
  PetscFunctionReturn(0);
}
3829ae82921SPaul Mullowney 
3839371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) {
3849ae82921SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
3859ae82921SPaul Mullowney   PetscInt                           n                  = A->rmap->n;
3869ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
387aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
3889ae82921SPaul Mullowney   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
3899ae82921SPaul Mullowney   const MatScalar                   *aa = a->a, *v;
3909ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
3919ae82921SPaul Mullowney   PetscInt                           i, nz, nzUpper, offset;
3929ae82921SPaul Mullowney 
3939ae82921SPaul Mullowney   PetscFunctionBegin;
394cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
395c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3969ae82921SPaul Mullowney     try {
3979ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
3989ae82921SPaul Mullowney       nzUpper = adiag[0] - adiag[n];
399da79fbbcSStefano Zampini       if (!upTriFactor) {
4002cbc15d9SMark         PetscScalar *AAUp;
4012cbc15d9SMark 
4029566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
4032cbc15d9SMark 
4049ae82921SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
4059566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
4069566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
4079ae82921SPaul Mullowney 
4089ae82921SPaul Mullowney         /* Fill the upper triangular matrix */
4099ae82921SPaul Mullowney         AiUp[0] = (PetscInt)0;
4109ae82921SPaul Mullowney         AiUp[n] = nzUpper;
4119ae82921SPaul Mullowney         offset  = nzUpper;
4129ae82921SPaul Mullowney         for (i = n - 1; i >= 0; i--) {
4139ae82921SPaul Mullowney           v  = aa + adiag[i + 1] + 1;
4149ae82921SPaul Mullowney           vi = aj + adiag[i + 1] + 1;
4159ae82921SPaul Mullowney 
416e057df02SPaul Mullowney           /* number of elements NOT on the diagonal */
4179ae82921SPaul Mullowney           nz = adiag[i] - adiag[i + 1] - 1;
4189ae82921SPaul Mullowney 
419e057df02SPaul Mullowney           /* decrement the offset */
4209ae82921SPaul Mullowney           offset -= (nz + 1);
4219ae82921SPaul Mullowney 
422e057df02SPaul Mullowney           /* first, set the diagonal elements */
4239ae82921SPaul Mullowney           AjUp[offset] = (PetscInt)i;
42409f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1. / v[nz];
4259ae82921SPaul Mullowney           AiUp[i]      = AiUp[i + 1] - (nz + 1);
4269ae82921SPaul Mullowney 
4279566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
4289566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
4299ae82921SPaul Mullowney         }
4302205254eSKarl Rupp 
431aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
4329566063dSJacob Faibussowitsch         PetscCall(PetscNew(&upTriFactor));
433da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
4342205254eSKarl Rupp 
435aa372e3fSPaul Mullowney         /* Create the matrix description */
4369566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
4379566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
4381b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
4399566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
440afb2bd1cSJunchao Zhang #else
4419566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
442afb2bd1cSJunchao Zhang #endif
4439566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
4449566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
445aa372e3fSPaul Mullowney 
446aa372e3fSPaul Mullowney         /* set the operation */
447aa372e3fSPaul Mullowney         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
448aa372e3fSPaul Mullowney 
449aa372e3fSPaul Mullowney         /* set the matrix */
450aa372e3fSPaul Mullowney         upTriFactor->csrMat              = new CsrMatrix;
451aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows    = n;
452aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols    = n;
453aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = nzUpper;
454aa372e3fSPaul Mullowney 
455aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
456aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
457aa372e3fSPaul Mullowney 
458aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
459aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
460aa372e3fSPaul Mullowney 
461aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
462aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
463aa372e3fSPaul Mullowney 
464afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
4659566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
466261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
4671b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
4689371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
4699371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
4709566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
471afb2bd1cSJunchao Zhang #endif
472afb2bd1cSJunchao Zhang 
473aa372e3fSPaul Mullowney         /* perform the solve analysis */
4749371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
4759371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
4761b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
4779371c9d4SSatish Balay                                                   upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
478d49cd2b7SBarry Smith #else
4795f80ce2aSJacob Faibussowitsch                                                   upTriFactor->solveInfo));
480afb2bd1cSJunchao Zhang #endif
4819566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
4829566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
483aa372e3fSPaul Mullowney 
484da79fbbcSStefano Zampini         /* assign the pointer */
485aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
4862cbc15d9SMark         upTriFactor->AA_h                                          = AAUp;
4879566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiUp));
4889566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjUp));
4899566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
490da79fbbcSStefano Zampini       } else {
49148a46eb9SPierre Jolivet         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
492da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
493da79fbbcSStefano Zampini         offset = nzUpper;
494da79fbbcSStefano Zampini         for (i = n - 1; i >= 0; i--) {
495da79fbbcSStefano Zampini           v = aa + adiag[i + 1] + 1;
496da79fbbcSStefano Zampini 
497da79fbbcSStefano Zampini           /* number of elements NOT on the diagonal */
498da79fbbcSStefano Zampini           nz = adiag[i] - adiag[i + 1] - 1;
499da79fbbcSStefano Zampini 
500da79fbbcSStefano Zampini           /* decrement the offset */
501da79fbbcSStefano Zampini           offset -= (nz + 1);
502da79fbbcSStefano Zampini 
503da79fbbcSStefano Zampini           /* first, set the diagonal elements */
5042cbc15d9SMark           upTriFactor->AA_h[offset] = 1. / v[nz];
5059566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
506da79fbbcSStefano Zampini         }
5072cbc15d9SMark         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
5089566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
509da79fbbcSStefano Zampini       }
5109371c9d4SSatish Balay     } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
5119ae82921SPaul Mullowney   }
5129ae82921SPaul Mullowney   PetscFunctionReturn(0);
5139ae82921SPaul Mullowney }
5149ae82921SPaul Mullowney 
5159371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) {
5169ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
5179ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
5189ae82921SPaul Mullowney   IS                            isrow = a->row, iscol = a->icol;
5199ae82921SPaul Mullowney   PetscBool                     row_identity, col_identity;
5209ae82921SPaul Mullowney   PetscInt                      n = A->rmap->n;
5219ae82921SPaul Mullowney 
5229ae82921SPaul Mullowney   PetscFunctionBegin;
52328b400f6SJacob Faibussowitsch   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
5249566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
5259566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
5262205254eSKarl Rupp 
527ad540459SPierre Jolivet   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
528aa372e3fSPaul Mullowney   cusparseTriFactors->nnz = a->nz;
5299ae82921SPaul Mullowney 
530c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
531e057df02SPaul Mullowney   /* lower triangular indices */
5329566063dSJacob Faibussowitsch   PetscCall(ISIdentity(isrow, &row_identity));
533da79fbbcSStefano Zampini   if (!row_identity && !cusparseTriFactors->rpermIndices) {
534da79fbbcSStefano Zampini     const PetscInt *r;
535da79fbbcSStefano Zampini 
5369566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(isrow, &r));
537aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
538aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r + n);
5399566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(isrow, &r));
5409566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
541da79fbbcSStefano Zampini   }
5429ae82921SPaul Mullowney 
543e057df02SPaul Mullowney   /* upper triangular indices */
5449566063dSJacob Faibussowitsch   PetscCall(ISIdentity(iscol, &col_identity));
545da79fbbcSStefano Zampini   if (!col_identity && !cusparseTriFactors->cpermIndices) {
546da79fbbcSStefano Zampini     const PetscInt *c;
547da79fbbcSStefano Zampini 
5489566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(iscol, &c));
549aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
550aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c + n);
5519566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(iscol, &c));
5529566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
553da79fbbcSStefano Zampini   }
5549ae82921SPaul Mullowney   PetscFunctionReturn(0);
5559ae82921SPaul Mullowney }
5569ae82921SPaul Mullowney 
5579371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) {
558087f3262SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
559087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
560aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
561aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
562087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
563087f3262SPaul Mullowney   PetscScalar                       *AAUp;
564087f3262SPaul Mullowney   PetscScalar                       *AALo;
565087f3262SPaul Mullowney   PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
566087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
567087f3262SPaul Mullowney   const PetscInt                    *ai = b->i, *aj = b->j, *vj;
568087f3262SPaul Mullowney   const MatScalar                   *aa = b->a, *v;
569087f3262SPaul Mullowney 
570087f3262SPaul Mullowney   PetscFunctionBegin;
571cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
572c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
573087f3262SPaul Mullowney     try {
5749566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
5759566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
576da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
577087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
5789566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
5799566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
580087f3262SPaul Mullowney 
581087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
582087f3262SPaul Mullowney         AiUp[0] = (PetscInt)0;
583087f3262SPaul Mullowney         AiUp[n] = nzUpper;
584087f3262SPaul Mullowney         offset  = 0;
585087f3262SPaul Mullowney         for (i = 0; i < n; i++) {
586087f3262SPaul Mullowney           /* set the pointers */
587087f3262SPaul Mullowney           v  = aa + ai[i];
588087f3262SPaul Mullowney           vj = aj + ai[i];
589087f3262SPaul Mullowney           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
590087f3262SPaul Mullowney 
591087f3262SPaul Mullowney           /* first, set the diagonal elements */
592087f3262SPaul Mullowney           AjUp[offset] = (PetscInt)i;
59309f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0 / v[nz];
594087f3262SPaul Mullowney           AiUp[i]      = offset;
59509f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0 / v[nz];
596087f3262SPaul Mullowney 
597087f3262SPaul Mullowney           offset += 1;
598087f3262SPaul Mullowney           if (nz > 0) {
5999566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
6009566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
601087f3262SPaul Mullowney             for (j = offset; j < offset + nz; j++) {
602087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
603087f3262SPaul Mullowney               AALo[j] = AAUp[j] / v[nz];
604087f3262SPaul Mullowney             }
605087f3262SPaul Mullowney             offset += nz;
606087f3262SPaul Mullowney           }
607087f3262SPaul Mullowney         }
608087f3262SPaul Mullowney 
609aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
6109566063dSJacob Faibussowitsch         PetscCall(PetscNew(&upTriFactor));
611da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
612087f3262SPaul Mullowney 
613aa372e3fSPaul Mullowney         /* Create the matrix description */
6149566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
6159566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
6161b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
6179566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
618afb2bd1cSJunchao Zhang #else
6199566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
620afb2bd1cSJunchao Zhang #endif
6219566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
6229566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
623087f3262SPaul Mullowney 
624aa372e3fSPaul Mullowney         /* set the matrix */
625aa372e3fSPaul Mullowney         upTriFactor->csrMat              = new CsrMatrix;
626aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows    = A->rmap->n;
627aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols    = A->cmap->n;
628aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
629aa372e3fSPaul Mullowney 
630aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
631aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
632aa372e3fSPaul Mullowney 
633aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
634aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
635aa372e3fSPaul Mullowney 
636aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
637aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
638aa372e3fSPaul Mullowney 
639afb2bd1cSJunchao Zhang         /* set the operation */
640afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
641afb2bd1cSJunchao Zhang 
642afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
6439566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
644261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
6451b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
6469371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
6479371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
6489566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
649afb2bd1cSJunchao Zhang #endif
650afb2bd1cSJunchao Zhang 
651aa372e3fSPaul Mullowney         /* perform the solve analysis */
6529371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
6539371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
6541b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
6559371c9d4SSatish Balay                                                   upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
656d49cd2b7SBarry Smith #else
6575f80ce2aSJacob Faibussowitsch                                                   upTriFactor->solveInfo));
658afb2bd1cSJunchao Zhang #endif
6599566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
6609566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
661aa372e3fSPaul Mullowney 
662da79fbbcSStefano Zampini         /* assign the pointer */
663aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
664aa372e3fSPaul Mullowney 
665aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
6669566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
667da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
668aa372e3fSPaul Mullowney 
669aa372e3fSPaul Mullowney         /* Create the matrix description */
6709566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
6719566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
6721b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
6739566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
674afb2bd1cSJunchao Zhang #else
6759566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
676afb2bd1cSJunchao Zhang #endif
6779566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
6789566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
679aa372e3fSPaul Mullowney 
680aa372e3fSPaul Mullowney         /* set the operation */
681aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
682aa372e3fSPaul Mullowney 
683aa372e3fSPaul Mullowney         /* set the matrix */
684aa372e3fSPaul Mullowney         loTriFactor->csrMat              = new CsrMatrix;
685aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows    = A->rmap->n;
686aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols    = A->cmap->n;
687aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
688aa372e3fSPaul Mullowney 
689aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
690aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
691aa372e3fSPaul Mullowney 
692aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
693aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
694aa372e3fSPaul Mullowney 
695aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
696aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
697aa372e3fSPaul Mullowney 
698afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
6999566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
700261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
7011b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
7029371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
7039371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
7049566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
705afb2bd1cSJunchao Zhang #endif
706afb2bd1cSJunchao Zhang 
707aa372e3fSPaul Mullowney         /* perform the solve analysis */
7089371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
7099371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
7101b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
7119371c9d4SSatish Balay                                                   loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
712d49cd2b7SBarry Smith #else
7135f80ce2aSJacob Faibussowitsch                                                   loTriFactor->solveInfo));
714afb2bd1cSJunchao Zhang #endif
7159566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
7169566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
717aa372e3fSPaul Mullowney 
718da79fbbcSStefano Zampini         /* assign the pointer */
719aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
720087f3262SPaul Mullowney 
7219566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
7229566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiUp));
7239566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjUp));
724da79fbbcSStefano Zampini       } else {
725da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
726da79fbbcSStefano Zampini         offset = 0;
727da79fbbcSStefano Zampini         for (i = 0; i < n; i++) {
728da79fbbcSStefano Zampini           /* set the pointers */
729da79fbbcSStefano Zampini           v  = aa + ai[i];
730da79fbbcSStefano Zampini           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
731da79fbbcSStefano Zampini 
732da79fbbcSStefano Zampini           /* first, set the diagonal elements */
733da79fbbcSStefano Zampini           AAUp[offset] = 1.0 / v[nz];
734da79fbbcSStefano Zampini           AALo[offset] = 1.0 / v[nz];
735da79fbbcSStefano Zampini 
736da79fbbcSStefano Zampini           offset += 1;
737da79fbbcSStefano Zampini           if (nz > 0) {
7389566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
739da79fbbcSStefano Zampini             for (j = offset; j < offset + nz; j++) {
740da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
741da79fbbcSStefano Zampini               AALo[j] = AAUp[j] / v[nz];
742da79fbbcSStefano Zampini             }
743da79fbbcSStefano Zampini             offset += nz;
744da79fbbcSStefano Zampini           }
745da79fbbcSStefano Zampini         }
74628b400f6SJacob Faibussowitsch         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
74728b400f6SJacob Faibussowitsch         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
748da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
749da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
7509566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
751da79fbbcSStefano Zampini       }
7529566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFreeHost(AAUp));
7539566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFreeHost(AALo));
7549371c9d4SSatish Balay     } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
755087f3262SPaul Mullowney   }
756087f3262SPaul Mullowney   PetscFunctionReturn(0);
757087f3262SPaul Mullowney }
758087f3262SPaul Mullowney 
7599371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) {
760087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
761087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
762087f3262SPaul Mullowney   IS                            ip                 = a->row;
763087f3262SPaul Mullowney   PetscBool                     perm_identity;
764087f3262SPaul Mullowney   PetscInt                      n = A->rmap->n;
765087f3262SPaul Mullowney 
766087f3262SPaul Mullowney   PetscFunctionBegin;
76728b400f6SJacob Faibussowitsch   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
7689566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
769ad540459SPierre Jolivet   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
770aa372e3fSPaul Mullowney   cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
771aa372e3fSPaul Mullowney 
772da79fbbcSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_BOTH;
773da79fbbcSStefano Zampini 
774087f3262SPaul Mullowney   /* lower triangular indices */
7759566063dSJacob Faibussowitsch   PetscCall(ISIdentity(ip, &perm_identity));
776087f3262SPaul Mullowney   if (!perm_identity) {
7774e4bbfaaSStefano Zampini     IS              iip;
778da79fbbcSStefano Zampini     const PetscInt *irip, *rip;
7794e4bbfaaSStefano Zampini 
7809566063dSJacob Faibussowitsch     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
7819566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(iip, &irip));
7829566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(ip, &rip));
783aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
784aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip + n);
785aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
7864e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip + n);
7879566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(iip, &irip));
7889566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&iip));
7899566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(ip, &rip));
7909566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
791da79fbbcSStefano Zampini   }
792087f3262SPaul Mullowney   PetscFunctionReturn(0);
793087f3262SPaul Mullowney }
794087f3262SPaul Mullowney 
7959371c9d4SSatish Balay static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) {
796087f3262SPaul Mullowney   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
797087f3262SPaul Mullowney   IS          ip = b->row;
798087f3262SPaul Mullowney   PetscBool   perm_identity;
799087f3262SPaul Mullowney 
800087f3262SPaul Mullowney   PetscFunctionBegin;
8019566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
8029566063dSJacob Faibussowitsch   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
803ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
804087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
8059566063dSJacob Faibussowitsch   PetscCall(ISIdentity(ip, &perm_identity));
806087f3262SPaul Mullowney   if (perm_identity) {
807087f3262SPaul Mullowney     B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
808087f3262SPaul Mullowney     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
8094e4bbfaaSStefano Zampini     B->ops->matsolve          = NULL;
8104e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
811087f3262SPaul Mullowney   } else {
812087f3262SPaul Mullowney     B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
813087f3262SPaul Mullowney     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
8144e4bbfaaSStefano Zampini     B->ops->matsolve          = NULL;
8154e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
816087f3262SPaul Mullowney   }
817087f3262SPaul Mullowney 
818087f3262SPaul Mullowney   /* get the triangular factors */
8199566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
820087f3262SPaul Mullowney   PetscFunctionReturn(0);
821087f3262SPaul Mullowney }
8229ae82921SPaul Mullowney 
8239371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) {
824bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
825aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
826aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
827da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
828da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
829aa372e3fSPaul Mullowney   cusparseIndexBase_t                indexBase;
830aa372e3fSPaul Mullowney   cusparseMatrixType_t               matrixType;
831aa372e3fSPaul Mullowney   cusparseFillMode_t                 fillMode;
832aa372e3fSPaul Mullowney   cusparseDiagType_t                 diagType;
833b175d8bbSPaul Mullowney 
834bda325fcSPaul Mullowney   PetscFunctionBegin;
835aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
8369566063dSJacob Faibussowitsch   PetscCall(PetscNew(&loTriFactorT));
837da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
838aa372e3fSPaul Mullowney 
839aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
840aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
841aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
8429371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
843aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
844aa372e3fSPaul Mullowney 
845aa372e3fSPaul Mullowney   /* Create the matrix description */
8469566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
8479566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
8489566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
8499566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
8509566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
851aa372e3fSPaul Mullowney 
852aa372e3fSPaul Mullowney   /* set the operation */
853aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
854aa372e3fSPaul Mullowney 
855aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
856aa372e3fSPaul Mullowney   loTriFactorT->csrMat                 = new CsrMatrix;
857afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
858afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
859aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
860afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
861afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
862afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
863aa372e3fSPaul Mullowney 
864aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
865afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
8669371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
8679371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
8689371c9d4SSatish Balay                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
8699566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
870afb2bd1cSJunchao Zhang #endif
871afb2bd1cSJunchao Zhang 
8729566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
8739371c9d4SSatish Balay   PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
8749371c9d4SSatish Balay                                      loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
875afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
8769371c9d4SSatish Balay                                      loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
877afb2bd1cSJunchao Zhang #else
8789371c9d4SSatish Balay                                      loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase));
879afb2bd1cSJunchao Zhang #endif
8809566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
8819566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
882aa372e3fSPaul Mullowney 
883afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
8849566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
885261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
8861b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
8879371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
8889371c9d4SSatish Balay                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
8899566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
890afb2bd1cSJunchao Zhang #endif
891afb2bd1cSJunchao Zhang 
892afb2bd1cSJunchao Zhang   /* perform the solve analysis */
8939371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
8949371c9d4SSatish Balay                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
8951b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
8969371c9d4SSatish Balay                                             loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
897d49cd2b7SBarry Smith #else
8985f80ce2aSJacob Faibussowitsch                                             loTriFactorT->solveInfo));
899afb2bd1cSJunchao Zhang #endif
9009566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
9019566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
902aa372e3fSPaul Mullowney 
903da79fbbcSStefano Zampini   /* assign the pointer */
904aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
905aa372e3fSPaul Mullowney 
906aa372e3fSPaul Mullowney   /*********************************************/
907aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
908aa372e3fSPaul Mullowney   /*********************************************/
909aa372e3fSPaul Mullowney 
910aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
9119566063dSJacob Faibussowitsch   PetscCall(PetscNew(&upTriFactorT));
912da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
913aa372e3fSPaul Mullowney 
914aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
915aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
916aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
9179371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
918aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
919aa372e3fSPaul Mullowney 
920aa372e3fSPaul Mullowney   /* Create the matrix description */
9219566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
9229566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
9239566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
9249566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
9259566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
926aa372e3fSPaul Mullowney 
927aa372e3fSPaul Mullowney   /* set the operation */
928aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
929aa372e3fSPaul Mullowney 
930aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
931aa372e3fSPaul Mullowney   upTriFactorT->csrMat                 = new CsrMatrix;
932afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
933afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
934aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
935afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
936afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
937afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
938aa372e3fSPaul Mullowney 
939aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
940afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
9419371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
9429371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
9439371c9d4SSatish Balay                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
9449566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
945afb2bd1cSJunchao Zhang #endif
946afb2bd1cSJunchao Zhang 
9479566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
9489371c9d4SSatish Balay   PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
9499371c9d4SSatish Balay                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
950afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
9519371c9d4SSatish Balay                                      upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
952afb2bd1cSJunchao Zhang #else
9539371c9d4SSatish Balay                                      upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase));
954afb2bd1cSJunchao Zhang #endif
955d49cd2b7SBarry Smith 
9569566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
9579566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
958aa372e3fSPaul Mullowney 
959afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
9609566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
961261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
9621b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
9639371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
9649371c9d4SSatish Balay                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
9659566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
966afb2bd1cSJunchao Zhang #endif
967afb2bd1cSJunchao Zhang 
968afb2bd1cSJunchao Zhang   /* perform the solve analysis */
9695f80ce2aSJacob Faibussowitsch   /* christ, would it have killed you to put this stuff in a function????????? */
9709371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
9719371c9d4SSatish Balay                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
9721b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
9739371c9d4SSatish Balay                                             upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
974d49cd2b7SBarry Smith #else
9755f80ce2aSJacob Faibussowitsch                                             upTriFactorT->solveInfo));
976afb2bd1cSJunchao Zhang #endif
977d49cd2b7SBarry Smith 
9789566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
9799566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
980aa372e3fSPaul Mullowney 
981da79fbbcSStefano Zampini   /* assign the pointer */
982aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
983bda325fcSPaul Mullowney   PetscFunctionReturn(0);
984bda325fcSPaul Mullowney }
985bda325fcSPaul Mullowney 
9869371c9d4SSatish Balay struct PetscScalarToPetscInt {
9879371c9d4SSatish Balay   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
988a49f1ed0SStefano Zampini };
989a49f1ed0SStefano Zampini 
9909371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) {
991aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
992a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
993bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
994bda325fcSPaul Mullowney   cusparseStatus_t              stat;
995aa372e3fSPaul Mullowney   cusparseIndexBase_t           indexBase;
996b175d8bbSPaul Mullowney 
997bda325fcSPaul Mullowney   PetscFunctionBegin;
9989566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
999a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
100028b400f6SJacob Faibussowitsch   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1001a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
100208401ef6SPierre Jolivet   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
10031a2c6b5cSJunchao Zhang   if (A->transupdated) PetscFunctionReturn(0);
10049566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
10059566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
100648a46eb9SPierre Jolivet   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1007a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1008aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
10099566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1010aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
10119566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
10129566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1013aa372e3fSPaul Mullowney 
1014b06137fdSPaul Mullowney     /* set alpha and beta */
10159566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
10169566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
10179566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
10189566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
10199566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
10209566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1021b06137fdSPaul Mullowney 
1022aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1023aa372e3fSPaul Mullowney       CsrMatrix *matrixT      = new CsrMatrix;
1024a49f1ed0SStefano Zampini       matstructT->mat         = matrixT;
1025554b8892SKarl Rupp       matrixT->num_rows       = A->cmap->n;
1026554b8892SKarl Rupp       matrixT->num_cols       = A->rmap->n;
1027aa372e3fSPaul Mullowney       matrixT->num_entries    = a->nz;
1028a8bd5306SMark Adams       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1029aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1030aa372e3fSPaul Mullowney       matrixT->values         = new THRUSTARRAY(a->nz);
1031a3fdcf43SKarl Rupp 
1032ad540459SPierre Jolivet       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
103381902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1034afb2bd1cSJunchao Zhang 
1035afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
10363606e59fSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
10379371c9d4SSatish Balay       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
10389371c9d4SSatish Balay                                indexBase, cusparse_scalartype);
10399371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
10403606e59fSJunchao Zhang #else
10413606e59fSJunchao Zhang       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
10423606e59fSJunchao Zhang            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
10433606e59fSJunchao Zhang 
10443606e59fSJunchao Zhang            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
10453606e59fSJunchao Zhang            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
10463606e59fSJunchao Zhang            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
10473606e59fSJunchao Zhang         */
10483606e59fSJunchao Zhang       if (matrixT->num_entries) {
10499371c9d4SSatish Balay         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
10509371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
10513606e59fSJunchao Zhang 
10523606e59fSJunchao Zhang       } else {
10533606e59fSJunchao Zhang         matstructT->matDescr = NULL;
10543606e59fSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
10553606e59fSJunchao Zhang       }
10563606e59fSJunchao Zhang #endif
1057afb2bd1cSJunchao Zhang #endif
1058aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1059afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1060afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1061afb2bd1cSJunchao Zhang #else
1062aa372e3fSPaul Mullowney       CsrMatrix *temp = new CsrMatrix;
106351c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
106451c6d536SStefano Zampini       /* First convert HYB to CSR */
1065aa372e3fSPaul Mullowney       temp->num_rows = A->rmap->n;
1066aa372e3fSPaul Mullowney       temp->num_cols = A->cmap->n;
1067aa372e3fSPaul Mullowney       temp->num_entries = a->nz;
1068aa372e3fSPaul Mullowney       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1069aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1070aa372e3fSPaul Mullowney       temp->values = new THRUSTARRAY(a->nz);
1071aa372e3fSPaul Mullowney 
10729371c9d4SSatish Balay       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
10739371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1074aa372e3fSPaul Mullowney 
1075aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1076aa372e3fSPaul Mullowney       tempT->num_rows = A->rmap->n;
1077aa372e3fSPaul Mullowney       tempT->num_cols = A->cmap->n;
1078aa372e3fSPaul Mullowney       tempT->num_entries = a->nz;
1079aa372e3fSPaul Mullowney       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1080aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1081aa372e3fSPaul Mullowney       tempT->values = new THRUSTARRAY(a->nz);
1082aa372e3fSPaul Mullowney 
10839371c9d4SSatish Balay       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
10849371c9d4SSatish Balay                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
10859371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1086aa372e3fSPaul Mullowney 
1087aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1088aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
10899566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
10909371c9d4SSatish Balay       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
10919371c9d4SSatish Balay       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
10929371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1093aa372e3fSPaul Mullowney 
1094aa372e3fSPaul Mullowney       /* assign the pointer */
1095aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
10961a2c6b5cSJunchao Zhang       A->transupdated = PETSC_TRUE;
1097aa372e3fSPaul Mullowney       /* delete temporaries */
1098aa372e3fSPaul Mullowney       if (tempT) {
1099aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1100aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1101aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1102aa372e3fSPaul Mullowney         delete (CsrMatrix *)tempT;
1103087f3262SPaul Mullowney       }
1104aa372e3fSPaul Mullowney       if (temp) {
1105aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY *)temp->values;
1106aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1107aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1108aa372e3fSPaul Mullowney         delete (CsrMatrix *)temp;
1109aa372e3fSPaul Mullowney       }
1110afb2bd1cSJunchao Zhang #endif
1111aa372e3fSPaul Mullowney     }
1112a49f1ed0SStefano Zampini   }
1113a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1114a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1115a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
111628b400f6SJacob Faibussowitsch     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
111728b400f6SJacob Faibussowitsch     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
111828b400f6SJacob Faibussowitsch     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
111928b400f6SJacob Faibussowitsch     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
112028b400f6SJacob Faibussowitsch     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
112128b400f6SJacob Faibussowitsch     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
112228b400f6SJacob Faibussowitsch     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
112328b400f6SJacob Faibussowitsch     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1124a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1125a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1126a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
11279566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1128a49f1ed0SStefano Zampini     }
1129a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1130a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1131792fecdfSBarry Smith       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1132a49f1ed0SStefano Zampini 
1133a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1134a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1135a49f1ed0SStefano Zampini       void  *csr2cscBuffer;
1136a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
11379371c9d4SSatish Balay       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
11389371c9d4SSatish Balay                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
11399371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
11409566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1141a49f1ed0SStefano Zampini #endif
1142a49f1ed0SStefano Zampini 
11431a2c6b5cSJunchao Zhang       if (matrix->num_entries) {
11441a2c6b5cSJunchao Zhang         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
11451a2c6b5cSJunchao Zhang            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
11461a2c6b5cSJunchao Zhang            I checked every parameters and they were just fine. I have no clue why cusparse complains.
11471a2c6b5cSJunchao Zhang 
11481a2c6b5cSJunchao Zhang            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
11491a2c6b5cSJunchao Zhang            should be filled with indexBase. So I just take a shortcut here.
11501a2c6b5cSJunchao Zhang         */
11519371c9d4SSatish Balay         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1152a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11539371c9d4SSatish Balay                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
11549371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
1155a49f1ed0SStefano Zampini #else
11569371c9d4SSatish Balay                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
11579371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
1158a49f1ed0SStefano Zampini #endif
11591a2c6b5cSJunchao Zhang       } else {
11601a2c6b5cSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
11611a2c6b5cSJunchao Zhang       }
11621a2c6b5cSJunchao Zhang 
1163a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1164792fecdfSBarry Smith       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1165a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11669566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(csr2cscBuffer));
1167a49f1ed0SStefano Zampini #endif
1168a49f1ed0SStefano Zampini     }
11699371c9d4SSatish Balay     PetscCallThrust(
11709371c9d4SSatish Balay       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1171a49f1ed0SStefano Zampini   }
11729566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
11739566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1174213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1175213423ffSJunchao Zhang   matstructT->cprowIndices                       = NULL;
1176aa372e3fSPaul Mullowney   /* assign the pointer */
1177aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
11781a2c6b5cSJunchao Zhang   A->transupdated                                = PETSC_TRUE;
1179bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1180bda325fcSPaul Mullowney }
1181bda325fcSPaul Mullowney 
1182a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
11839371c9d4SSatish Balay static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) {
1184c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1185465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1186465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1187465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1188465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1189bda325fcSPaul Mullowney   cusparseStatus_t                      stat;
1190bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1191aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1192aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1193aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1194bda325fcSPaul Mullowney 
1195bda325fcSPaul Mullowney   PetscFunctionBegin;
1196aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1197aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
11989566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1199aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1200aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1201bda325fcSPaul Mullowney   }
1202bda325fcSPaul Mullowney 
1203bda325fcSPaul Mullowney   /* Get the GPU pointers */
12049566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
12059566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1206c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1207c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1208bda325fcSPaul Mullowney 
12099566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1210aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
12119371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1212aa372e3fSPaul Mullowney 
1213aa372e3fSPaul Mullowney   /* First, solve U */
12149371c9d4SSatish Balay   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
12151b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1216afb2bd1cSJunchao Zhang                               upTriFactorT->csrMat->num_entries,
1217afb2bd1cSJunchao Zhang #endif
12189371c9d4SSatish Balay                               &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray,
12191b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
12209371c9d4SSatish Balay                               tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
12219371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1222d49cd2b7SBarry Smith #else
12239371c9d4SSatish Balay                               tempGPU->data().get());
12249371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1225afb2bd1cSJunchao Zhang #endif
1226aa372e3fSPaul Mullowney 
1227aa372e3fSPaul Mullowney   /* Then, solve L */
12289371c9d4SSatish Balay   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
12291b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1230afb2bd1cSJunchao Zhang                               loTriFactorT->csrMat->num_entries,
1231afb2bd1cSJunchao Zhang #endif
12329371c9d4SSatish Balay                               &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1233d49cd2b7SBarry Smith                               tempGPU->data().get(),
12341b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
12359371c9d4SSatish Balay                               xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
12369371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1237d49cd2b7SBarry Smith #else
12389371c9d4SSatish Balay                               xarray);
12399371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1240afb2bd1cSJunchao Zhang #endif
1241aa372e3fSPaul Mullowney 
1242aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
12439371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1244aa372e3fSPaul Mullowney 
1245aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1246a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1247bda325fcSPaul Mullowney 
1248bda325fcSPaul Mullowney   /* restore */
12499566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
12509566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
12519566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
12529566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1253bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1254bda325fcSPaul Mullowney }
1255bda325fcSPaul Mullowney 
12569371c9d4SSatish Balay static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) {
1257465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1258465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1259bda325fcSPaul Mullowney   cusparseStatus_t                   stat;
1260bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1261aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1262aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1263aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1264bda325fcSPaul Mullowney 
1265bda325fcSPaul Mullowney   PetscFunctionBegin;
1266aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1267aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
12689566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1269aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1270aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1271bda325fcSPaul Mullowney   }
1272bda325fcSPaul Mullowney 
1273bda325fcSPaul Mullowney   /* Get the GPU pointers */
12749566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
12759566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1276bda325fcSPaul Mullowney 
12779566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1278aa372e3fSPaul Mullowney   /* First, solve U */
12799371c9d4SSatish Balay   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
12801b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1281afb2bd1cSJunchao Zhang                               upTriFactorT->csrMat->num_entries,
1282afb2bd1cSJunchao Zhang #endif
12839371c9d4SSatish Balay                               &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray,
12841b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
12859371c9d4SSatish Balay                               tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
12869371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1287d49cd2b7SBarry Smith #else
12889371c9d4SSatish Balay                               tempGPU->data().get());
12899371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1290afb2bd1cSJunchao Zhang #endif
1291aa372e3fSPaul Mullowney 
1292aa372e3fSPaul Mullowney   /* Then, solve L */
12939371c9d4SSatish Balay   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
12941b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1295afb2bd1cSJunchao Zhang                               loTriFactorT->csrMat->num_entries,
1296afb2bd1cSJunchao Zhang #endif
12979371c9d4SSatish Balay                               &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1298d49cd2b7SBarry Smith                               tempGPU->data().get(),
12991b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
13009371c9d4SSatish Balay                               xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
13019371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1302d49cd2b7SBarry Smith #else
13039371c9d4SSatish Balay                               xarray);
13049371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1305afb2bd1cSJunchao Zhang #endif
1306bda325fcSPaul Mullowney 
1307bda325fcSPaul Mullowney   /* restore */
13089566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
13099566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
13109566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
13119566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1312bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1313bda325fcSPaul Mullowney }
1314bda325fcSPaul Mullowney 
13159371c9d4SSatish Balay static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) {
1316465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1317465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1318465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1319465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
13209ae82921SPaul Mullowney   cusparseStatus_t                      stat;
13219ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1322aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1323aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1324aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
13259ae82921SPaul Mullowney 
13269ae82921SPaul Mullowney   PetscFunctionBegin;
1327ebc8f436SDominic Meiser 
1328e057df02SPaul Mullowney   /* Get the GPU pointers */
13299566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
13309566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1331c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1332c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
13339ae82921SPaul Mullowney 
13349566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1335aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
13369371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1337aa372e3fSPaul Mullowney 
1338aa372e3fSPaul Mullowney   /* Next, solve L */
13399371c9d4SSatish Balay   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
13401b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1341afb2bd1cSJunchao Zhang                               loTriFactor->csrMat->num_entries,
1342afb2bd1cSJunchao Zhang #endif
13439371c9d4SSatish Balay                               &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
1344d49cd2b7SBarry Smith                               tempGPU->data().get(),
13451b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
13469371c9d4SSatish Balay                               xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer);
13479371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1348d49cd2b7SBarry Smith #else
13499371c9d4SSatish Balay                               xarray);
13509371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1351afb2bd1cSJunchao Zhang #endif
1352aa372e3fSPaul Mullowney 
1353aa372e3fSPaul Mullowney   /* Then, solve U */
13549371c9d4SSatish Balay   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
13551b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1356afb2bd1cSJunchao Zhang                               upTriFactor->csrMat->num_entries,
1357afb2bd1cSJunchao Zhang #endif
13589371c9d4SSatish Balay                               &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray,
13591b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
13609371c9d4SSatish Balay                               tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer);
13619371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1362d49cd2b7SBarry Smith #else
13639371c9d4SSatish Balay                               tempGPU->data().get());
13649371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1365afb2bd1cSJunchao Zhang #endif
1366d49cd2b7SBarry Smith 
13674e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
13689371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
13699ae82921SPaul Mullowney 
13709566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
13719566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
13729566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
13739566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
13749ae82921SPaul Mullowney   PetscFunctionReturn(0);
13759ae82921SPaul Mullowney }
13769ae82921SPaul Mullowney 
13779371c9d4SSatish Balay static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) {
1378465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1379465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
13809ae82921SPaul Mullowney   cusparseStatus_t                   stat;
13819ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1382aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1383aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1384aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
13859ae82921SPaul Mullowney 
13869ae82921SPaul Mullowney   PetscFunctionBegin;
1387e057df02SPaul Mullowney   /* Get the GPU pointers */
13889566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
13899566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
13909ae82921SPaul Mullowney 
13919566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1392aa372e3fSPaul Mullowney   /* First, solve L */
13939371c9d4SSatish Balay   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
13941b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1395afb2bd1cSJunchao Zhang                               loTriFactor->csrMat->num_entries,
1396afb2bd1cSJunchao Zhang #endif
13979371c9d4SSatish Balay                               &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray,
13981b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
13999371c9d4SSatish Balay                               tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer);
14009371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1401d49cd2b7SBarry Smith #else
14029371c9d4SSatish Balay                               tempGPU->data().get());
14039371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1404afb2bd1cSJunchao Zhang #endif
1405d49cd2b7SBarry Smith 
1406aa372e3fSPaul Mullowney   /* Next, solve U */
14079371c9d4SSatish Balay   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
14081b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1409afb2bd1cSJunchao Zhang                               upTriFactor->csrMat->num_entries,
1410afb2bd1cSJunchao Zhang #endif
14119371c9d4SSatish Balay                               &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
1412d49cd2b7SBarry Smith                               tempGPU->data().get(),
14131b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
14149371c9d4SSatish Balay                               xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer);
14159371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1416d49cd2b7SBarry Smith #else
14179371c9d4SSatish Balay                               xarray);
14189371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1419afb2bd1cSJunchao Zhang #endif
14209ae82921SPaul Mullowney 
14219566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
14229566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
14239566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
14249566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
14259ae82921SPaul Mullowney   PetscFunctionReturn(0);
14269ae82921SPaul Mullowney }
14279ae82921SPaul Mullowney 
1428da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500
1429da112707SJunchao Zhang /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
14309371c9d4SSatish Balay static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) {
1431da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1432da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1433da112707SJunchao Zhang   const PetscScalar            *barray;
1434da112707SJunchao Zhang   PetscScalar                  *xarray;
1435da112707SJunchao Zhang 
1436da112707SJunchao Zhang   PetscFunctionBegin;
1437da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1438da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1439da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1440da112707SJunchao Zhang 
1441da112707SJunchao Zhang   /* Solve L*y = b */
1442da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1443da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
14449371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
14459371c9d4SSatish Balay                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
144612ba2bc6SJunchao Zhang                                        fs->spsvDescr_L)); // cusparseSpSV_solve() scretely uses the external buffer used in cusparseSpSV_analysis()!
1447da112707SJunchao Zhang 
1448da112707SJunchao Zhang   /* Solve U*x = y */
1449da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
14509371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
14519371c9d4SSatish Balay                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));
1452da112707SJunchao Zhang 
1453da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1454da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1455da112707SJunchao Zhang 
1456da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1457da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1458da112707SJunchao Zhang   PetscFunctionReturn(0);
1459da112707SJunchao Zhang }
1460da112707SJunchao Zhang 
14619371c9d4SSatish Balay static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) {
1462da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1463da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1464da112707SJunchao Zhang   const PetscScalar            *barray;
1465da112707SJunchao Zhang   PetscScalar                  *xarray;
1466da112707SJunchao Zhang 
1467da112707SJunchao Zhang   PetscFunctionBegin;
146812ba2bc6SJunchao Zhang   if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
1469da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
14709371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do tranpose solve with it */
14719371c9d4SSatish Balay                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1472da112707SJunchao Zhang 
1473da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
14749371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1475da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
147612ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
147712ba2bc6SJunchao Zhang     fs->createdTransposeSpSVDescr = PETSC_TRUE;
147812ba2bc6SJunchao Zhang   }
1479da112707SJunchao Zhang 
148012ba2bc6SJunchao Zhang   if (!fs->updatedTransposeSpSVAnalysis) {
14819371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1482da112707SJunchao Zhang 
14839371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
148412ba2bc6SJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1485da112707SJunchao Zhang   }
1486da112707SJunchao Zhang 
1487da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1488da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1489da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1490da112707SJunchao Zhang 
1491da112707SJunchao Zhang   /* Solve Ut*y = b */
1492da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1493da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
14949371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
14959371c9d4SSatish Balay                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));
1496da112707SJunchao Zhang 
1497da112707SJunchao Zhang   /* Solve Lt*x = y */
1498da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
14999371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
15009371c9d4SSatish Balay                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1501da112707SJunchao Zhang 
1502da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1503da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1504da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1505da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1506da112707SJunchao Zhang   PetscFunctionReturn(0);
1507da112707SJunchao Zhang }
1508da112707SJunchao Zhang 
15099371c9d4SSatish Balay static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info) {
1510da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1511da112707SJunchao Zhang   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1512da112707SJunchao Zhang   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1513da112707SJunchao Zhang   CsrMatrix                    *Acsr;
1514da112707SJunchao Zhang   PetscInt                      m, nz;
1515da112707SJunchao Zhang   PetscBool                     flg;
1516da112707SJunchao Zhang 
1517da112707SJunchao Zhang   PetscFunctionBegin;
1518da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1519da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1520da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1521da112707SJunchao Zhang   }
1522da112707SJunchao Zhang 
1523da112707SJunchao Zhang   /* Copy A's value to fact */
1524da112707SJunchao Zhang   m  = fact->rmap->n;
1525da112707SJunchao Zhang   nz = aij->nz;
1526da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1527da112707SJunchao Zhang   Acsr = (CsrMatrix *)Acusp->mat->mat;
1528da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1529da112707SJunchao Zhang 
1530da112707SJunchao Zhang   /* Factorize fact inplace */
15319371c9d4SSatish Balay   if (m)
15329371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
15339371c9d4SSatish Balay                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1534da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1535da112707SJunchao Zhang     int              numerical_zero;
1536da112707SJunchao Zhang     cusparseStatus_t status;
1537da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1538da112707SJunchao Zhang     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1539da112707SJunchao Zhang   }
1540da112707SJunchao Zhang 
154112ba2bc6SJunchao Zhang   /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
154212ba2bc6SJunchao Zhang      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
154312ba2bc6SJunchao Zhang   */
15449371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1545da112707SJunchao Zhang 
15469371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1547da112707SJunchao Zhang 
154812ba2bc6SJunchao Zhang   /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
154912ba2bc6SJunchao Zhang   fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
155012ba2bc6SJunchao Zhang 
1551da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1552da112707SJunchao Zhang   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
1553da112707SJunchao Zhang   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
1554da112707SJunchao Zhang   fact->ops->matsolve          = NULL;
1555da112707SJunchao Zhang   fact->ops->matsolvetranspose = NULL;
1556da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1557da112707SJunchao Zhang   PetscFunctionReturn(0);
1558da112707SJunchao Zhang }
1559da112707SJunchao Zhang 
15609371c9d4SSatish Balay static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
1561da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1562da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1563da112707SJunchao Zhang   PetscInt                      m, nz;
1564da112707SJunchao Zhang 
1565da112707SJunchao Zhang   PetscFunctionBegin;
1566da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1567da112707SJunchao Zhang     PetscInt  i;
1568da112707SJunchao Zhang     PetscBool flg, missing;
1569da112707SJunchao Zhang 
1570da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1571da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1572da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1573da112707SJunchao Zhang     PetscCall(MatMissingDiagonal(A, &missing, &i));
1574da112707SJunchao Zhang     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1575da112707SJunchao Zhang   }
1576da112707SJunchao Zhang 
1577da112707SJunchao Zhang   /* Free the old stale stuff */
1578da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1579da112707SJunchao Zhang 
1580da112707SJunchao Zhang   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1581da112707SJunchao Zhang      but they will not be used. Allocate them just for easy debugging.
1582da112707SJunchao Zhang    */
1583da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1584da112707SJunchao Zhang 
1585da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1586da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ILU;
1587da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
1588da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
1589da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
1590da112707SJunchao Zhang 
1591da112707SJunchao Zhang   aij->row = NULL;
1592da112707SJunchao Zhang   aij->col = NULL;
1593da112707SJunchao Zhang 
1594da112707SJunchao Zhang   /* ====================================================================== */
1595da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1596da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
1597da112707SJunchao Zhang   /* ====================================================================== */
1598da112707SJunchao Zhang   const int *Ai, *Aj;
1599da112707SJunchao Zhang 
1600da112707SJunchao Zhang   m  = fact->rmap->n;
1601da112707SJunchao Zhang   nz = aij->nz;
1602da112707SJunchao Zhang 
1603da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1604da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1605da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1606da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1607da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1608da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1609da112707SJunchao Zhang 
1610da112707SJunchao Zhang   /* ====================================================================== */
1611da112707SJunchao Zhang   /* Create descriptors for M, L, U                                         */
1612da112707SJunchao Zhang   /* ====================================================================== */
1613da112707SJunchao Zhang   cusparseFillMode_t fillMode;
1614da112707SJunchao Zhang   cusparseDiagType_t diagType;
1615da112707SJunchao Zhang 
1616da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1617da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1618da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1619da112707SJunchao Zhang 
1620da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1621da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1622da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1623da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1624da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1625da112707SJunchao Zhang   */
1626da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
1627da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_UNIT;
16289371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
16299371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
16309371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1631da112707SJunchao Zhang 
1632da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_UPPER;
1633da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
16349371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
16359371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
16369371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1637da112707SJunchao Zhang 
1638da112707SJunchao Zhang   /* ========================================================================= */
1639da112707SJunchao Zhang   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1640da112707SJunchao Zhang   /* ========================================================================= */
1641da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
16429371c9d4SSatish Balay   if (m)
16439371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
16449371c9d4SSatish Balay                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1645da112707SJunchao Zhang 
1646da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1647da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1648da112707SJunchao Zhang 
1649da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1650da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1651da112707SJunchao Zhang 
1652da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
16539371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1654da112707SJunchao Zhang 
1655da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
16569371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1657da112707SJunchao Zhang 
1658da112707SJunchao Zhang   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
165912ba2bc6SJunchao Zhang      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
166012ba2bc6SJunchao Zhang      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
166112ba2bc6SJunchao Zhang      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1662da112707SJunchao Zhang    */
166312ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
166412ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
166512ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
1666da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
166712ba2bc6SJunchao Zhang   } else {
166812ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
166912ba2bc6SJunchao Zhang     fs->spsvBuffer_U = fs->factBuffer_M;
1670da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
167112ba2bc6SJunchao Zhang   }
1672da112707SJunchao Zhang 
1673da112707SJunchao Zhang   /* ========================================================================== */
1674da112707SJunchao Zhang   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1675da112707SJunchao Zhang   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1676da112707SJunchao Zhang   /* ========================================================================== */
1677da112707SJunchao Zhang   int              structural_zero;
1678da112707SJunchao Zhang   cusparseStatus_t status;
1679da112707SJunchao Zhang 
1680da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
16819371c9d4SSatish Balay   if (m)
16829371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
16839371c9d4SSatish Balay                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1684da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1685da112707SJunchao Zhang     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1686da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1687da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1688da112707SJunchao Zhang   }
1689da112707SJunchao Zhang 
1690da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
16910dd8c0acSJunchao Zhang   {
1692da112707SJunchao Zhang     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
16930dd8c0acSJunchao Zhang     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1694da112707SJunchao Zhang     PetscLogDouble flops = 0.0;
1695da112707SJunchao Zhang 
1696da112707SJunchao Zhang     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1697da112707SJunchao Zhang     Ai    = Aseq->i;
1698da112707SJunchao Zhang     Adiag = Aseq->diag;
1699da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
1700da112707SJunchao Zhang       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1701da112707SJunchao Zhang         nzRow  = Ai[i + 1] - Ai[i];
1702da112707SJunchao Zhang         nzLeft = Adiag[i] - Ai[i];
1703da112707SJunchao Zhang         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1704da112707SJunchao Zhang           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1705da112707SJunchao Zhang         */
1706da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2;
1707da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1708da112707SJunchao Zhang       }
1709da112707SJunchao Zhang     }
1710da112707SJunchao Zhang     fs->numericFactFlops = flops;
17110dd8c0acSJunchao Zhang   }
1712da112707SJunchao Zhang   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1713da112707SJunchao Zhang   PetscFunctionReturn(0);
1714da112707SJunchao Zhang }
1715da112707SJunchao Zhang 
17169371c9d4SSatish Balay static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) {
1717da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1718da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1719da112707SJunchao Zhang   const PetscScalar            *barray;
1720da112707SJunchao Zhang   PetscScalar                  *xarray;
1721da112707SJunchao Zhang 
1722da112707SJunchao Zhang   PetscFunctionBegin;
1723da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1724da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1725da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1726da112707SJunchao Zhang 
1727da112707SJunchao Zhang   /* Solve L*y = b */
1728da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1729da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
17309371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
17319371c9d4SSatish Balay                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1732da112707SJunchao Zhang 
1733da112707SJunchao Zhang   /* Solve Lt*x = y */
1734da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
17359371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
17369371c9d4SSatish Balay                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1737da112707SJunchao Zhang 
1738da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1739da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1740da112707SJunchao Zhang 
1741da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1742da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1743da112707SJunchao Zhang   PetscFunctionReturn(0);
1744da112707SJunchao Zhang }
1745da112707SJunchao Zhang 
17469371c9d4SSatish Balay static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info) {
1747da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1748da112707SJunchao Zhang   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1749da112707SJunchao Zhang   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1750da112707SJunchao Zhang   CsrMatrix                    *Acsr;
1751da112707SJunchao Zhang   PetscInt                      m, nz;
1752da112707SJunchao Zhang   PetscBool                     flg;
1753da112707SJunchao Zhang 
1754da112707SJunchao Zhang   PetscFunctionBegin;
1755da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1756da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1757da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1758da112707SJunchao Zhang   }
1759da112707SJunchao Zhang 
1760da112707SJunchao Zhang   /* Copy A's value to fact */
1761da112707SJunchao Zhang   m  = fact->rmap->n;
1762da112707SJunchao Zhang   nz = aij->nz;
1763da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1764da112707SJunchao Zhang   Acsr = (CsrMatrix *)Acusp->mat->mat;
1765da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1766da112707SJunchao Zhang 
1767da112707SJunchao Zhang   /* Factorize fact inplace */
1768da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1769da112707SJunchao Zhang      Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1770da112707SJunchao Zhang      The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1771da112707SJunchao Zhang      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1772da112707SJunchao Zhang      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1773da112707SJunchao Zhang    */
17749371c9d4SSatish Balay   if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1775da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1776da112707SJunchao Zhang     int              numerical_zero;
1777da112707SJunchao Zhang     cusparseStatus_t status;
1778da112707SJunchao Zhang     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1779da112707SJunchao Zhang     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1780da112707SJunchao Zhang   }
1781da112707SJunchao Zhang 
17829371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1783da112707SJunchao Zhang 
1784da112707SJunchao Zhang   /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1785da112707SJunchao Zhang     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1786da112707SJunchao Zhang   */
17879371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1788da112707SJunchao Zhang 
1789da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1790da112707SJunchao Zhang   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
1791da112707SJunchao Zhang   fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
1792da112707SJunchao Zhang   fact->ops->matsolve          = NULL;
1793da112707SJunchao Zhang   fact->ops->matsolvetranspose = NULL;
1794da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1795da112707SJunchao Zhang   PetscFunctionReturn(0);
1796da112707SJunchao Zhang }
1797da112707SJunchao Zhang 
17989371c9d4SSatish Balay static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info) {
1799da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1800da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1801da112707SJunchao Zhang   PetscInt                      m, nz;
1802da112707SJunchao Zhang 
1803da112707SJunchao Zhang   PetscFunctionBegin;
1804da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1805da112707SJunchao Zhang     PetscInt  i;
1806da112707SJunchao Zhang     PetscBool flg, missing;
1807da112707SJunchao Zhang 
1808da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1809da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1810da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1811da112707SJunchao Zhang     PetscCall(MatMissingDiagonal(A, &missing, &i));
1812da112707SJunchao Zhang     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1813da112707SJunchao Zhang   }
1814da112707SJunchao Zhang 
1815da112707SJunchao Zhang   /* Free the old stale stuff */
1816da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1817da112707SJunchao Zhang 
1818da112707SJunchao Zhang   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1819da112707SJunchao Zhang      but they will not be used. Allocate them just for easy debugging.
1820da112707SJunchao Zhang    */
1821da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1822da112707SJunchao Zhang 
1823da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1824da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ICC;
1825da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
1826da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
1827da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
1828da112707SJunchao Zhang 
1829da112707SJunchao Zhang   aij->row = NULL;
1830da112707SJunchao Zhang   aij->col = NULL;
1831da112707SJunchao Zhang 
1832da112707SJunchao Zhang   /* ====================================================================== */
1833da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1834da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
1835da112707SJunchao Zhang   /* ====================================================================== */
1836da112707SJunchao Zhang   const int *Ai, *Aj;
1837da112707SJunchao Zhang 
1838da112707SJunchao Zhang   m  = fact->rmap->n;
1839da112707SJunchao Zhang   nz = aij->nz;
1840da112707SJunchao Zhang 
1841da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1842da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1843da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1844da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1845da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1846da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1847da112707SJunchao Zhang 
1848da112707SJunchao Zhang   /* ====================================================================== */
1849da112707SJunchao Zhang   /* Create mat descriptors for M, L                                        */
1850da112707SJunchao Zhang   /* ====================================================================== */
1851da112707SJunchao Zhang   cusparseFillMode_t fillMode;
1852da112707SJunchao Zhang   cusparseDiagType_t diagType;
1853da112707SJunchao Zhang 
1854da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1855da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1856da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1857da112707SJunchao Zhang 
1858da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1859da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1860da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1861da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1862da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1863da112707SJunchao Zhang   */
1864da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
1865da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
18669371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18679371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18689371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1869da112707SJunchao Zhang 
1870da112707SJunchao Zhang   /* ========================================================================= */
1871da112707SJunchao Zhang   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
1872da112707SJunchao Zhang   /* ========================================================================= */
1873da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
18749371c9d4SSatish Balay   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));
1875da112707SJunchao Zhang 
1876da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1877da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1878da112707SJunchao Zhang 
1879da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1880da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1881da112707SJunchao Zhang 
1882da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
18839371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1884da112707SJunchao Zhang 
1885da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
18869371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1887da112707SJunchao Zhang 
188812ba2bc6SJunchao Zhang   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
188912ba2bc6SJunchao Zhang      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
189012ba2bc6SJunchao Zhang    */
189112ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
189212ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
189312ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
1894da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
189512ba2bc6SJunchao Zhang   } else {
189612ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
189712ba2bc6SJunchao Zhang     fs->spsvBuffer_Lt = fs->factBuffer_M;
189812ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
189912ba2bc6SJunchao Zhang   }
1900da112707SJunchao Zhang 
1901da112707SJunchao Zhang   /* ========================================================================== */
1902da112707SJunchao Zhang   /* Perform analysis of ic0 on M                                               */
1903da112707SJunchao Zhang   /* The lower triangular part of M has the same sparsity pattern as L          */
1904da112707SJunchao Zhang   /* ========================================================================== */
1905da112707SJunchao Zhang   int              structural_zero;
1906da112707SJunchao Zhang   cusparseStatus_t status;
1907da112707SJunchao Zhang 
1908da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
19099371c9d4SSatish Balay   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1910da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1911da112707SJunchao Zhang     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1912da112707SJunchao Zhang     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1913da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1914da112707SJunchao Zhang   }
1915da112707SJunchao Zhang 
1916da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
19170dd8c0acSJunchao Zhang   {
1918da112707SJunchao Zhang     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
19190dd8c0acSJunchao Zhang     PetscInt      *Ai, nzRow, nzLeft;
1920da112707SJunchao Zhang     PetscLogDouble flops = 0.0;
1921da112707SJunchao Zhang 
1922da112707SJunchao Zhang     Ai = Aseq->i;
1923da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
1924da112707SJunchao Zhang       nzRow = Ai[i + 1] - Ai[i];
1925da112707SJunchao Zhang       if (nzRow > 1) {
1926da112707SJunchao Zhang         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1927da112707SJunchao Zhang           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1928da112707SJunchao Zhang         */
1929da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2;
1930da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1931da112707SJunchao Zhang       }
1932da112707SJunchao Zhang     }
1933da112707SJunchao Zhang     fs->numericFactFlops = flops;
19340dd8c0acSJunchao Zhang   }
1935da112707SJunchao Zhang   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
1936da112707SJunchao Zhang   PetscFunctionReturn(0);
1937da112707SJunchao Zhang }
1938da112707SJunchao Zhang #endif
1939da112707SJunchao Zhang 
/* Symbolic ILU factorization of A into the factor matrix B.

   When cuSPARSE is recent enough, no fill levels are requested, the row and column
   permutations are both identities, and the factorization is bound to the device,
   dispatch to the device ILU(0) path. Otherwise fall back to the CPU SeqAIJ symbolic
   factorization and install the CUSPARSE numeric-factorization callback on B. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) {
    /* the device ILU(0) path below is only taken for natural (identity) orderings */
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    /* fallback: clear any stale device factor data, run the CPU symbolic phase,
       and route the numeric phase to the CUSPARSE implementation */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1961da112707SJunchao Zhang 
/* Symbolic LU factorization of A into B: delegate the symbolic phase to the CPU
   SeqAIJ implementation, then route the numeric phase to the CUSPARSE version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* discard any previously built device-side triangular-factor data */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1971da112707SJunchao Zhang 
/* Symbolic ICC factorization of A into the factor matrix B.

   When cuSPARSE is recent enough, no fill levels are requested, the permutation is
   the identity, and the factorization is bound to the device, dispatch to the device
   ICC(0) path. Otherwise fall back to the CPU SeqAIJ symbolic factorization and
   install the CUSPARSE Cholesky numeric callback on B. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool perm_identity = PETSC_FALSE;
  /* the device ICC(0) path below is only taken for the natural (identity) ordering */
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    /* fallback: clear any stale device factor data and run the CPU symbolic phase */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1990da112707SJunchao Zhang 
/* Symbolic Cholesky factorization of A into B: delegate the symbolic phase to the
   CPU SeqAIJ implementation, then route the numeric phase to the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* discard any previously built device-side triangular-factor data */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2000da112707SJunchao Zhang 
/* Report the solver package that produced this factor matrix (always MATSOLVERCUSPARSE) */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type) {
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
2006841d4cb1SJunchao Zhang 
2007841d4cb1SJunchao Zhang /*MC
2008841d4cb1SJunchao Zhang   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
2014841d4cb1SJunchao Zhang 
2015841d4cb1SJunchao Zhang   Level: beginner
2016841d4cb1SJunchao Zhang 
201711a5261eSBarry Smith .seealso: `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2018841d4cb1SJunchao Zhang M*/
2019841d4cb1SJunchao Zhang 
/* Create a factor matrix B of type MATSEQAIJCUSPARSE for factorizing A.

   Supports LU/ILU/ILUDT and Cholesky/ICC factor types; installs the CUSPARSE
   symbolic-factorization callbacks unless A is bound to the CPU, in which case the
   plain SeqAIJ callbacks are used. The option -mat_factor_bind_factorization
   ("host" or "device", default "device") selects where the factorization runs. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B) {
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  /* B is square with the same (local) row size as A */
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* parse -mat_factor_bind_factorization using B's factor prefix (falling back to A's) */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* A lives on the CPU: use the host-only symbolic factorizations */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* preferred orderings: nested dissection for full LU, natural for incomplete */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
2071841d4cb1SJunchao Zhang 
/* Copy the matrix values from the GPU back into the host CSR array a->a.

   Only acts when the device holds the authoritative copy (offload mask GPU);
   afterwards host and device agree (mask BOTH). Unfactored matrices copy from the
   cuSPARSE CSR structure; factored matrices copy from the factor value array when
   available (cuSPARSE >= 13500), otherwise an error is raised. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) {
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 13500
  /* for factored matrices spptr holds the triangular-factors structure instead */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
21007e8381f9SStefano Zampini 
/* Return the host values array for read/write access, syncing from the device
   first if the device copy is the current one */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
210767a45760SJunchao Zhang 
/* Release the host values array obtained with MatSeqAIJGetArray_SeqAIJCUSPARSE.
   The host copy may have been modified, so the device copy is marked stale. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  *array = NULL; /* invalidate the caller's pointer */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
211467a45760SJunchao Zhang 
/* Return the host values array for read-only access, syncing from the device
   first if the device copy is the current one */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) {
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
212167a45760SJunchao Zhang 
/* Release the read-only host values array obtained with
   MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; read-only access cannot modify the
   values, so the offload mask is left unchanged */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) {
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}
212767a45760SJunchao Zhang 
/* Return the host values array for write-only access; unlike
   MatSeqAIJGetArray_SeqAIJCUSPARSE, no device-to-host copy is performed since the
   caller intends to overwrite the values */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
213367a45760SJunchao Zhang 
/* Release the write-only host values array; the host copy is now authoritative,
   so the device copy is marked stale */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  *array = NULL; /* invalidate the caller's pointer */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
21407e8381f9SStefano Zampini 
/* Return device pointers to A's CSR arrays (row offsets i, column indices j, values a)
   and the memory type they live in (CUDA). The matrix is uploaded to the GPU first if
   necessary. Only unfactored matrices are supported. The cuSPARSE index arrays are
   32-bit (THRUSTINTARRAY32), so they cannot be exposed as PetscInt when PETSc is
   configured with 64-bit indices.

   Fix: the two error messages read "does not supported"; corrected to "does not
   support". */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) {
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}
21707ee59b9bSJunchao Zhang 
/* Upload A's host CSR data to the GPU, building or refreshing the cuSPARSE
   representation stored in A->spptr.

   Two paths:
   - If the nonzero pattern is unchanged (same nonzerostate) and the format is plain
     CSR, only the numerical values are re-uploaded.
   - Otherwise the whole device structure is destroyed and rebuilt: matrix descriptor,
     device-resident scalar constants (alpha/beta), the CSR (or pre-CUDA-11 ELL/HYB)
     storage, and the compressed-row index list when compressed rows are in use.
   On success the offload mask becomes BOTH, unless the host values array a->a is
   absent, in which case only the pattern was uploaded. Errors if A is bound to the CPU. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) {
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed, so any cached transpose is out of date (structure kept) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      /* pattern (or format) changed: tear down and rebuild the device structure */
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* pick the (possibly compressed-row) host CSR arrays to upload */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values yet: upload pattern only, do not claim BOTH at the end */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre-CUDA-11 path: build a temporary CSR, convert it to HYB, then free the CSR */
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
23179ae82921SPaul Mullowney 
/* Thrust functor: for a zipped tuple t = (src, dst), accumulate the first element
   into the second (dst += src) */
struct VecCUDAPlusEquals {
  template <typename T>
  __host__ __device__ void operator()(T t) {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
2324aa372e3fSPaul Mullowney 
/* Thrust functor: for a zipped tuple t = (src, dst), copy the first element into
   the second (dst = src) */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
23317e8381f9SStefano Zampini 
/* Thrust functor: for a zipped tuple t = (dst, src), copy the second element into
   the first (dst = src) -- the reverse direction of VecCUDAEquals */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2338e6e9a74fSStefano Zampini 
/* Workspace carried by a Mat_Product between the symbolic and numeric phases of
   cuSPARSE matrix-matrix products; freed by MatDestroy_MatMatCusparse */
struct MatMatCusparse {
  PetscBool      cisdense; /* NOTE(review): presumably records whether the product C is dense -- confirm with the product routines */
  PetscScalar   *Bt;       /* device buffer released with cudaFree; presumably holds an explicit transpose of B -- confirm with callers */
  Mat            X;        /* auxiliary matrix owned by this workspace (destroyed with it) */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count associated with the product (for logging) */
  CsrMatrix     *Bcsr;     /* CSR copy/view of B owned by this workspace (deleted in the destructor) */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;  /* sparse descriptor for B */
  PetscBool            initialized;  /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;    /* dense descriptor for B */
  cusparseDnMatDescr_t matCDescr;    /* dense descriptor for C */
  PetscInt             Blda, Clda;   /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* extra SpGEMM device buffers required by the CUDA >= 11.4 API */
  void *dBuffer5;
#endif
  size_t                mmBufferSize; /* size of mmBuffer in bytes */
  void                 *mmBuffer;     /* main device work buffer for the product */
  void                 *mmBuffer2;    /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;   /* cuSPARSE SpGEMM descriptor */
#endif
};
2363ccdfe979SStefano Zampini 
/* Destructor for a MatMatCusparse workspace: frees the device buffers, destroys the
   cuSPARSE descriptors, releases the auxiliary matrix, then frees the struct itself */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data) {
  MatMatCusparse *mm = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mm->Bt));
  delete mm->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mm->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr));
  if (mm->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matBDescr));
  if (mm->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matCDescr));
  if (mm->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mm->dBuffer4) PetscCallCUDA(cudaFree(mm->dBuffer4));
  if (mm->dBuffer5) PetscCallCUDA(cudaFree(mm->dBuffer5));
#endif
  if (mm->mmBuffer) PetscCallCUDA(cudaFree(mm->mmBuffer));
  if (mm->mmBuffer2) PetscCallCUDA(cudaFree(mm->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mm->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2386ccdfe979SStefano Zampini 
2387ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);
2388ccdfe979SStefano Zampini 
23899371c9d4SSatish Balay static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) {
2390ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2391ccdfe979SStefano Zampini   Mat                           A, B;
2392afb2bd1cSJunchao Zhang   PetscInt                      m, n, blda, clda;
2393ccdfe979SStefano Zampini   PetscBool                     flg, biscuda;
2394ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2395ccdfe979SStefano Zampini   cusparseStatus_t              stat;
2396ccdfe979SStefano Zampini   cusparseOperation_t           opA;
2397ccdfe979SStefano Zampini   const PetscScalar            *barray;
2398ccdfe979SStefano Zampini   PetscScalar                  *carray;
2399ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
2400ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2401ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2402ccdfe979SStefano Zampini 
2403ccdfe979SStefano Zampini   PetscFunctionBegin;
2404ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
240528b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2406ccdfe979SStefano Zampini   mmdata = (MatMatCusparse *)product->data;
2407ccdfe979SStefano Zampini   A      = product->A;
2408ccdfe979SStefano Zampini   B      = product->B;
24099566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
241028b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2411ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2412ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
241328b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
24149566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2415ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2416ccdfe979SStefano Zampini   switch (product->type) {
2417ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2418ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2419ccdfe979SStefano Zampini     mat = cusp->mat;
2420ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2421ccdfe979SStefano Zampini     m   = A->rmap->n;
2422ccdfe979SStefano Zampini     n   = B->cmap->n;
2423ccdfe979SStefano Zampini     break;
2424ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
24251a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2426e6e9a74fSStefano Zampini       mat = cusp->mat;
2427e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2428e6e9a74fSStefano Zampini     } else {
24299566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2430ccdfe979SStefano Zampini       mat = cusp->matTranspose;
2431ccdfe979SStefano Zampini       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2432e6e9a74fSStefano Zampini     }
2433ccdfe979SStefano Zampini     m = A->cmap->n;
2434ccdfe979SStefano Zampini     n = B->cmap->n;
2435ccdfe979SStefano Zampini     break;
2436ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2437ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2438ccdfe979SStefano Zampini     mat = cusp->mat;
2439ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2440ccdfe979SStefano Zampini     m   = A->rmap->n;
2441ccdfe979SStefano Zampini     n   = B->rmap->n;
2442ccdfe979SStefano Zampini     break;
24439371c9d4SSatish Balay   default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2444ccdfe979SStefano Zampini   }
244528b400f6SJacob Faibussowitsch   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2446ccdfe979SStefano Zampini   csrmat = (CsrMatrix *)mat->mat;
2447ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
24489566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
24499566063dSJacob Faibussowitsch   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
24509566063dSJacob Faibussowitsch   PetscCall(MatDenseCUDAGetArrayRead(B, &barray));
2451afb2bd1cSJunchao Zhang 
24529566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B, &blda));
2453c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
24549566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X, &carray));
24559566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2456c8378d12SStefano Zampini   } else {
24579566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDAGetArrayWrite(C, &carray));
24589566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(C, &clda));
2459c8378d12SStefano Zampini   }
2460c8378d12SStefano Zampini 
24619566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2462afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2463afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2464a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2465afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2466fcdce8c4SStefano Zampini     size_t mmBufferSize;
24679371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Blda != blda) {
24689371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
24699371c9d4SSatish Balay       mmdata->matBDescr = NULL;
24709371c9d4SSatish Balay     }
2471afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
24729566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2473afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2474afb2bd1cSJunchao Zhang     }
2475c8378d12SStefano Zampini 
24769371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Clda != clda) {
24779371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
24789371c9d4SSatish Balay       mmdata->matCDescr = NULL;
24799371c9d4SSatish Balay     }
2480afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
24819566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2482afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2483afb2bd1cSJunchao Zhang     }
2484afb2bd1cSJunchao Zhang 
2485afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
24869371c9d4SSatish Balay       stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
24879371c9d4SSatish Balay                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
24889371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
2489afb2bd1cSJunchao Zhang     }
24909371c9d4SSatish Balay     stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
24919371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
2492fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
24939566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
24949566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2495fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2496fcdce8c4SStefano Zampini     }
2497afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2498afb2bd1cSJunchao Zhang   } else {
2499afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
25009566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
25019566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
25029566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2503afb2bd1cSJunchao Zhang   }
2504afb2bd1cSJunchao Zhang 
2505afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
25069371c9d4SSatish Balay   stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
25079371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2508afb2bd1cSJunchao Zhang #else
2509afb2bd1cSJunchao Zhang   PetscInt k;
2510afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2511ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2512ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2513ccdfe979SStefano Zampini     cublasStatus_t cerr;
2514ccdfe979SStefano Zampini 
25159566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
25169371c9d4SSatish Balay     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
25179371c9d4SSatish Balay     PetscCallCUBLAS(cerr);
2518ccdfe979SStefano Zampini     blda = B->cmap->n;
2519afb2bd1cSJunchao Zhang     k = B->cmap->n;
2520afb2bd1cSJunchao Zhang   } else {
2521afb2bd1cSJunchao Zhang     k = B->rmap->n;
2522ccdfe979SStefano Zampini   }
2523ccdfe979SStefano Zampini 
2524afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
25259371c9d4SSatish Balay   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
25269371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2527afb2bd1cSJunchao Zhang #endif
25289566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
25299566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
25309566063dSJacob Faibussowitsch   PetscCall(MatDenseCUDARestoreArrayRead(B, &barray));
2531ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
25329566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
25339566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2534ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
25359566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
25369566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2537ccdfe979SStefano Zampini   } else {
25389566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(C, &carray));
2539ccdfe979SStefano Zampini   }
254048a46eb9SPierre Jolivet   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
254148a46eb9SPierre Jolivet   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2542ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2543ccdfe979SStefano Zampini }
2544ccdfe979SStefano Zampini 
25459371c9d4SSatish Balay static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) {
2546ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2547ccdfe979SStefano Zampini   Mat                 A, B;
2548ccdfe979SStefano Zampini   PetscInt            m, n;
2549ccdfe979SStefano Zampini   PetscBool           cisdense, flg;
2550ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2551ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2552ccdfe979SStefano Zampini 
2553ccdfe979SStefano Zampini   PetscFunctionBegin;
2554ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
255528b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2556ccdfe979SStefano Zampini   A = product->A;
2557ccdfe979SStefano Zampini   B = product->B;
25589566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
255928b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2560ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
256108401ef6SPierre Jolivet   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2562ccdfe979SStefano Zampini   switch (product->type) {
2563ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2564ccdfe979SStefano Zampini     m = A->rmap->n;
2565ccdfe979SStefano Zampini     n = B->cmap->n;
2566ccdfe979SStefano Zampini     break;
2567ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2568ccdfe979SStefano Zampini     m = A->cmap->n;
2569ccdfe979SStefano Zampini     n = B->cmap->n;
2570ccdfe979SStefano Zampini     break;
2571ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2572ccdfe979SStefano Zampini     m = A->rmap->n;
2573ccdfe979SStefano Zampini     n = B->rmap->n;
2574ccdfe979SStefano Zampini     break;
2575ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2576ccdfe979SStefano Zampini     m = B->cmap->n;
2577ccdfe979SStefano Zampini     n = B->cmap->n;
2578ccdfe979SStefano Zampini     break;
2579ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2580ccdfe979SStefano Zampini     m = B->rmap->n;
2581ccdfe979SStefano Zampini     n = B->rmap->n;
2582ccdfe979SStefano Zampini     break;
25839371c9d4SSatish Balay   default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2584ccdfe979SStefano Zampini   }
25859566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
2586ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
25879566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
25889566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQDENSECUDA));
2589ccdfe979SStefano Zampini 
2590ccdfe979SStefano Zampini   /* product data */
25919566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2592ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2593afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2594afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
259548a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2596afb2bd1cSJunchao Zhang #endif
2597ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2598ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
25999566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
26009566063dSJacob Faibussowitsch     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2601ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
26029566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2603ccdfe979SStefano Zampini     } else {
26049566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2605ccdfe979SStefano Zampini     }
2606ccdfe979SStefano Zampini   }
2607ccdfe979SStefano Zampini   C->product->data    = mmdata;
2608ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2609ccdfe979SStefano Zampini 
2610ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2611ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2612ccdfe979SStefano Zampini }
2613ccdfe979SStefano Zampini 
26149371c9d4SSatish Balay static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) {
2615ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2616fcdce8c4SStefano Zampini   Mat                           A, B;
2617fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2618fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2619fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2620fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2621fcdce8c4SStefano Zampini   PetscBool                     flg;
2622fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
2623fcdce8c4SStefano Zampini   MatProductType                ptype;
2624fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2625fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2626fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
2627fcdce8c4SStefano Zampini #endif
2628b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2629ccdfe979SStefano Zampini 
2630ccdfe979SStefano Zampini   PetscFunctionBegin;
2631ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
263228b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
26339566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
263428b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2635fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse *)C->product->data;
2636fcdce8c4SStefano Zampini   A      = product->A;
2637fcdce8c4SStefano Zampini   B      = product->B;
2638fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2639fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2640fcdce8c4SStefano Zampini     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
264108401ef6SPierre Jolivet     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2642fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
264328b400f6SJacob Faibussowitsch     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2644fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix *)Cmat->mat;
264528b400f6SJacob Faibussowitsch     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2646fcdce8c4SStefano Zampini     goto finalize;
2647fcdce8c4SStefano Zampini   }
2648fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
26499566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
265028b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
26519566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
265228b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
265328b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
265428b400f6SJacob Faibussowitsch   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2655fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2656fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2657fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
265808401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
265908401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
266008401ef6SPierre Jolivet   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
26619566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
26629566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2663fcdce8c4SStefano Zampini 
2664fcdce8c4SStefano Zampini   ptype = product->type;
2665b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2666fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
266728b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2668fa046f9fSJunchao Zhang   }
2669b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2670fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
267128b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2672fa046f9fSJunchao Zhang   }
2673fcdce8c4SStefano Zampini   switch (ptype) {
2674fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2675fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2676fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2677fcdce8c4SStefano Zampini     break;
2678fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2679fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2680fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2681fcdce8c4SStefano Zampini     break;
2682fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2683fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2684fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2685fcdce8c4SStefano Zampini     break;
26869371c9d4SSatish Balay   default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2687fcdce8c4SStefano Zampini   }
2688fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
268928b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
269028b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
269128b400f6SJacob Faibussowitsch   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2692fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
2693fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2694fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix *)Cmat->mat;
269528b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
269628b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
269728b400f6SJacob Faibussowitsch   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
26989566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2699fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2700fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
27019566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2702b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
27039371c9d4SSatish Balay   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
27049371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2705b4285af6SJunchao Zhang #else
27069371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
27079371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
27089371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
27099371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2710b4285af6SJunchao Zhang #endif
2711fcdce8c4SStefano Zampini #else
27129371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
27139371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
27149371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2715fcdce8c4SStefano Zampini #endif
27169566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
27179566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
27189566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
2719fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
2720fcdce8c4SStefano Zampini finalize:
2721fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
27229566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
27239566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
27249566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2725fcdce8c4SStefano Zampini   c->reallocs = 0;
2726fcdce8c4SStefano Zampini   C->info.mallocs += 0;
2727fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
2728fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
2729fcdce8c4SStefano Zampini   C->num_ass++;
2730ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2731ccdfe979SStefano Zampini }
2732fcdce8c4SStefano Zampini 
27339371c9d4SSatish Balay static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) {
2734fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2735fcdce8c4SStefano Zampini   Mat                           A, B;
2736fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2737fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a, *b, *c;
2738fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2739fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2740fcdce8c4SStefano Zampini   PetscInt                      i, j, m, n, k;
2741fcdce8c4SStefano Zampini   PetscBool                     flg;
2742fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
2743fcdce8c4SStefano Zampini   MatProductType                ptype;
2744fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2745fcdce8c4SStefano Zampini   PetscLogDouble                flops;
2746fcdce8c4SStefano Zampini   PetscBool                     biscompressed, ciscompressed;
2747fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2748fcdce8c4SStefano Zampini   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
2749fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
2750fcdce8c4SStefano Zampini #else
2751fcdce8c4SStefano Zampini   int cnz;
2752fcdce8c4SStefano Zampini #endif
2753b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2754fcdce8c4SStefano Zampini 
2755fcdce8c4SStefano Zampini   PetscFunctionBegin;
2756fcdce8c4SStefano Zampini   MatCheckProduct(C, 1);
275728b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2758fcdce8c4SStefano Zampini   A = product->A;
2759fcdce8c4SStefano Zampini   B = product->B;
27609566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
276128b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
27629566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
276328b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2764fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ *)A->data;
2765fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ *)B->data;
2766fcdce8c4SStefano Zampini   /* product data */
27679566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2768fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2769fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2770fcdce8c4SStefano Zampini 
27719566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
27729566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2773d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2774d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
277508401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
277608401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2777d60bce21SJunchao Zhang 
2778fcdce8c4SStefano Zampini   ptype = product->type;
2779b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2780fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
2781fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2782fa046f9fSJunchao Zhang   }
2783b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2784fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
2785fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2786fa046f9fSJunchao Zhang   }
2787fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2788fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2789fcdce8c4SStefano Zampini   switch (ptype) {
2790fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2791fcdce8c4SStefano Zampini     m    = A->rmap->n;
2792fcdce8c4SStefano Zampini     n    = B->cmap->n;
2793fcdce8c4SStefano Zampini     k    = A->cmap->n;
2794fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2795fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2796fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2797fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2798fcdce8c4SStefano Zampini     break;
2799fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2800fcdce8c4SStefano Zampini     m = A->cmap->n;
2801fcdce8c4SStefano Zampini     n = B->cmap->n;
2802fcdce8c4SStefano Zampini     k = A->rmap->n;
28039566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2804fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2805fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2806fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2807fcdce8c4SStefano Zampini     break;
2808fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2809fcdce8c4SStefano Zampini     m = A->rmap->n;
2810fcdce8c4SStefano Zampini     n = B->rmap->n;
2811fcdce8c4SStefano Zampini     k = A->cmap->n;
28129566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2813fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2814fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2815fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2816fcdce8c4SStefano Zampini     break;
28179371c9d4SSatish Balay   default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2818fcdce8c4SStefano Zampini   }
2819fcdce8c4SStefano Zampini 
2820fcdce8c4SStefano Zampini   /* create cusparse matrix */
28219566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
28229566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
2823fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ *)C->data;
2824fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2825fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2826fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2827fcdce8c4SStefano Zampini 
2828fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2829fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
2830fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
28319566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
28329566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2833fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2834fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2835fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2836fcdce8c4SStefano Zampini   } else {
2837fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2838fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2839fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2840fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2841fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2842fcdce8c4SStefano Zampini   }
2843fcdce8c4SStefano Zampini   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2844fcdce8c4SStefano Zampini   Ccusp->mat        = Cmat;
2845fcdce8c4SStefano Zampini   Ccusp->mat->mat   = Ccsr;
2846fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2847fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2848fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
28499566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
28509566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
28519566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
28529566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
28539566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
28549566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
28559566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
28569566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
28579566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2858fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
2859fcdce8c4SStefano Zampini     thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2860fcdce8c4SStefano Zampini     c->nz                = 0;
2861fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2862fcdce8c4SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
2863fcdce8c4SStefano Zampini     goto finalizesym;
2864fcdce8c4SStefano Zampini   }
2865fcdce8c4SStefano Zampini 
286628b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
286728b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2868fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
2869fcdce8c4SStefano Zampini   if (!biscompressed) {
2870fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix *)Bmat->mat;
2871fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2872fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2873fcdce8c4SStefano Zampini #endif
2874fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2875fcdce8c4SStefano Zampini     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2876fcdce8c4SStefano Zampini     Bcsr                 = new CsrMatrix;
2877fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2878fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2879fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2880fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2881fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2882fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2883fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2884fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
28859566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2886fcdce8c4SStefano Zampini     }
2887fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2888fcdce8c4SStefano Zampini     mmdata->Bcsr      = Bcsr;
2889fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2890fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
28919371c9d4SSatish Balay       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
28929371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
2893fcdce8c4SStefano Zampini     }
2894fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2895fcdce8c4SStefano Zampini #endif
2896fcdce8c4SStefano Zampini   }
289728b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
289828b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2899fcdce8c4SStefano Zampini   /* precompute flops count */
2900fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2901fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2902fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2903fcdce8c4SStefano Zampini       const PetscInt en = a->i[i + 1];
2904fcdce8c4SStefano Zampini       for (j = st; j < en; j++) {
2905fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2906fcdce8c4SStefano Zampini         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2907fcdce8c4SStefano Zampini       }
2908fcdce8c4SStefano Zampini     }
2909fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2910fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2911fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i + 1] - a->i[i];
2912fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2913fcdce8c4SStefano Zampini       flops += (2. * anzi) * bnzi;
2914fcdce8c4SStefano Zampini     }
2915fcdce8c4SStefano Zampini   } else { /* TODO */
2916fcdce8c4SStefano Zampini     flops = 0.;
2917fcdce8c4SStefano Zampini   }
2918fcdce8c4SStefano Zampini 
2919fcdce8c4SStefano Zampini   mmdata->flops = flops;
29209566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2921b4285af6SJunchao Zhang 
2922fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
29239566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
29249371c9d4SSatish Balay   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
29259371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
29269566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2927b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2928b4285af6SJunchao Zhang   {
2929b4285af6SJunchao Zhang     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2930b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2931b4285af6SJunchao Zhang   */
2932b4285af6SJunchao Zhang     void  *dBuffer1    = NULL;
2933b4285af6SJunchao Zhang     void  *dBuffer2    = NULL;
2934b4285af6SJunchao Zhang     void  *dBuffer3    = NULL;
2935b4285af6SJunchao Zhang     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2936b4285af6SJunchao Zhang     size_t bufferSize1 = 0;
2937b4285af6SJunchao Zhang     size_t bufferSize2 = 0;
2938b4285af6SJunchao Zhang     size_t bufferSize3 = 0;
2939b4285af6SJunchao Zhang     size_t bufferSize4 = 0;
2940b4285af6SJunchao Zhang     size_t bufferSize5 = 0;
2941b4285af6SJunchao Zhang 
2942b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
2943b4285af6SJunchao Zhang     /* ask bufferSize1 bytes for external memory */
29449371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
29459371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29469566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
2947b4285af6SJunchao Zhang     /* inspect the matrices A and B to understand the memory requirement for the next step */
29489371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
29499371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
2950b4285af6SJunchao Zhang 
2951b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
29529371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
29539371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29549566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
29559566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
29569566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
29579371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
29589371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29599566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer1));
29609566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer2));
2961b4285af6SJunchao Zhang 
2962b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
2963b4285af6SJunchao Zhang     /* get matrix C non-zero entries C_nnz1 */
29649566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2965b4285af6SJunchao Zhang     c->nz                = (PetscInt)C_nnz1;
2966b4285af6SJunchao Zhang     /* allocate matrix C */
29679371c9d4SSatish Balay     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
29689371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
29699371c9d4SSatish Balay     Ccsr->values = new THRUSTARRAY(c->nz);
29709371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2971b4285af6SJunchao Zhang     /* update matC with the new pointers */
29729371c9d4SSatish Balay     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
29739371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
2974b4285af6SJunchao Zhang 
2975b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
29769371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
29779371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29789566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
29799371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
29809371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29819566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer3));
29829371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
29839371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29849566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2985b4285af6SJunchao Zhang   }
2986ae37ee31SJunchao Zhang #else
2987b4285af6SJunchao Zhang   size_t bufSize2;
2988fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
29899371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
29909371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
29919566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2992fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
29939371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
29949371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2995fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
29969371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
29979371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2998fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor is the API.
2999fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
3000fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
3001fcdce8c4SStefano Zampini      it only appears in the workEstimation calls, but it seems it is needed in compute, so probably its address
3002fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
30039566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3004fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
30059371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
30069371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3007fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
30089566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3009fcdce8c4SStefano Zampini   c->nz = (PetscInt)C_nnz1;
30109371c9d4SSatish Balay   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
30119371c9d4SSatish Balay                       mmdata->mmBufferSize / 1024));
3012fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
30139566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3014fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
30159566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
30169371c9d4SSatish Balay   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
30179371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
30189371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
30199371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3020ae37ee31SJunchao Zhang #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3021fcdce8c4SStefano Zampini #else
30229566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
30239371c9d4SSatish Balay   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
30249371c9d4SSatish Balay                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
30259371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3026fcdce8c4SStefano Zampini   c->nz = cnz;
3027fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
30289566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3029fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
30309566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3031fcdce8c4SStefano Zampini 
30329566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3033fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3034fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows doing the symbolic phase by passing NULL for the values, but it seems quite buggy when
3035fcdce8c4SStefano Zampini      D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
30369371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
30379371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
30389371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3039fcdce8c4SStefano Zampini #endif
30409566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
30419566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3042fcdce8c4SStefano Zampini finalizesym:
3043fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
3044fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
3045fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
30469566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m + 1, &c->i));
30479566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->j));
3048fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
3049fcdce8c4SStefano Zampini     PetscInt      *d_i = c->i;
3050fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3051fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3052fcdce8c4SStefano Zampini     ii = *Ccsr->row_offsets;
3053fcdce8c4SStefano Zampini     jj = *Ccsr->column_indices;
3054fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
30559566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
30569566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3057fcdce8c4SStefano Zampini   } else {
3058fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
3059fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
30609566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
30619566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3062fcdce8c4SStefano Zampini   }
3063fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
3064fcdce8c4SStefano Zampini     PetscInt r = 0;
3065fcdce8c4SStefano Zampini     c->i[0]    = 0;
3066fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
3067fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
3068fcdce8c4SStefano Zampini       const PetscInt old  = c->compressedrow.i[k];
3069fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r + 1] = old;
3070fcdce8c4SStefano Zampini     }
3071fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3072fcdce8c4SStefano Zampini   }
30739566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
30749566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->ilen));
30759566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->imax));
3076fcdce8c4SStefano Zampini   c->maxnz         = c->nz;
3077fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
3078fcdce8c4SStefano Zampini   c->rmax          = 0;
3079fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
3080fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k + 1] - c->i[k];
3081fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
3082fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt) !!nn;
3083fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax, nn);
3084fcdce8c4SStefano Zampini   }
30859566063dSJacob Faibussowitsch   PetscCall(MatMarkDiagonal_SeqAIJ(C));
30869566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->a));
3087fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
3088fcdce8c4SStefano Zampini 
3089fcdce8c4SStefano Zampini   C->nonzerostate++;
30909566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
30919566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
3092fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
3093fcdce8c4SStefano Zampini   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3094fcdce8c4SStefano Zampini   C->preallocated     = PETSC_TRUE;
3095fcdce8c4SStefano Zampini   C->assembled        = PETSC_FALSE;
3096fcdce8c4SStefano Zampini   C->was_assembled    = PETSC_FALSE;
3097abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3098fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
3099fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
3100fcdce8c4SStefano Zampini   }
3101fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3102fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
3103fcdce8c4SStefano Zampini }
3104fcdce8c4SStefano Zampini 
3105fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3106fcdce8c4SStefano Zampini 
3107fcdce8c4SStefano Zampini /* handles sparse or dense B */
31089371c9d4SSatish Balay static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) {
3109fcdce8c4SStefano Zampini   Mat_Product *product = mat->product;
3110fcdce8c4SStefano Zampini   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3111fcdce8c4SStefano Zampini 
3112fcdce8c4SStefano Zampini   PetscFunctionBegin;
3113fcdce8c4SStefano Zampini   MatCheckProduct(mat, 1);
31149566063dSJacob Faibussowitsch   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
311548a46eb9SPierre Jolivet   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3116fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
3117fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
311848a46eb9SPierre Jolivet     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3119fcdce8c4SStefano Zampini   }
312065e4b4d4SStefano Zampini   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
312165e4b4d4SStefano Zampini     PetscBool usecpu = PETSC_FALSE;
312265e4b4d4SStefano Zampini     switch (product->type) {
312365e4b4d4SStefano Zampini     case MATPRODUCT_AB:
312465e4b4d4SStefano Zampini       if (product->api_user) {
3125d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
31269566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3127d0609cedSBarry Smith         PetscOptionsEnd();
312865e4b4d4SStefano Zampini       } else {
3129d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
31309566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3131d0609cedSBarry Smith         PetscOptionsEnd();
313265e4b4d4SStefano Zampini       }
313365e4b4d4SStefano Zampini       break;
313465e4b4d4SStefano Zampini     case MATPRODUCT_AtB:
313565e4b4d4SStefano Zampini       if (product->api_user) {
3136d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
31379566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3138d0609cedSBarry Smith         PetscOptionsEnd();
313965e4b4d4SStefano Zampini       } else {
3140d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
31419566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3142d0609cedSBarry Smith         PetscOptionsEnd();
314365e4b4d4SStefano Zampini       }
314465e4b4d4SStefano Zampini       break;
314565e4b4d4SStefano Zampini     case MATPRODUCT_PtAP:
314665e4b4d4SStefano Zampini       if (product->api_user) {
3147d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
31489566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3149d0609cedSBarry Smith         PetscOptionsEnd();
315065e4b4d4SStefano Zampini       } else {
3151d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
31529566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3153d0609cedSBarry Smith         PetscOptionsEnd();
315465e4b4d4SStefano Zampini       }
315565e4b4d4SStefano Zampini       break;
315665e4b4d4SStefano Zampini     case MATPRODUCT_RARt:
315765e4b4d4SStefano Zampini       if (product->api_user) {
3158d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
31599566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3160d0609cedSBarry Smith         PetscOptionsEnd();
316165e4b4d4SStefano Zampini       } else {
3162d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
31639566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3164d0609cedSBarry Smith         PetscOptionsEnd();
316565e4b4d4SStefano Zampini       }
316665e4b4d4SStefano Zampini       break;
316765e4b4d4SStefano Zampini     case MATPRODUCT_ABC:
316865e4b4d4SStefano Zampini       if (product->api_user) {
3169d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
31709566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3171d0609cedSBarry Smith         PetscOptionsEnd();
317265e4b4d4SStefano Zampini       } else {
3173d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
31749566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3175d0609cedSBarry Smith         PetscOptionsEnd();
317665e4b4d4SStefano Zampini       }
317765e4b4d4SStefano Zampini       break;
31789371c9d4SSatish Balay     default: break;
317965e4b4d4SStefano Zampini     }
318065e4b4d4SStefano Zampini     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
318165e4b4d4SStefano Zampini   }
318265e4b4d4SStefano Zampini   /* dispatch */
3183fcdce8c4SStefano Zampini   if (isdense) {
3184ccdfe979SStefano Zampini     switch (product->type) {
3185ccdfe979SStefano Zampini     case MATPRODUCT_AB:
3186ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
3187ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
3188ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
3189ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
3190fcdce8c4SStefano Zampini       if (product->A->boundtocpu) {
31919566063dSJacob Faibussowitsch         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3192fcdce8c4SStefano Zampini       } else {
3193fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3194fcdce8c4SStefano Zampini       }
3195fcdce8c4SStefano Zampini       break;
31969371c9d4SSatish Balay     case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
31979371c9d4SSatish Balay     default: break;
3198ccdfe979SStefano Zampini     }
3199fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
3200fcdce8c4SStefano Zampini     switch (product->type) {
3201fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
3202fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
32039371c9d4SSatish Balay     case MATPRODUCT_ABt: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; break;
3204fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
3205fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
32069371c9d4SSatish Balay     case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
32079371c9d4SSatish Balay     default: break;
3208fcdce8c4SStefano Zampini     }
3209fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
32109566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3211fcdce8c4SStefano Zampini   }
3212ccdfe979SStefano Zampini   PetscFunctionReturn(0);
3213ccdfe979SStefano Zampini }
3214ccdfe979SStefano Zampini 
/* MatMult: yy = A*xx (no add, no transpose, no conjugation) — thin dispatch into the shared kernel */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  PetscFunctionBegin;
  /* NULL in the yy slot means "nothing to add"; trans = herm = PETSC_FALSE selects op(A) = A */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3220e6e9a74fSStefano Zampini 
/* MatMultAdd: zz = A*xx + yy — thin dispatch into the shared kernel */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  PetscFunctionBegin;
  /* trans = herm = PETSC_FALSE selects op(A) = A */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3226e6e9a74fSStefano Zampini 
/* MatMultHermitianTranspose: yy = A^H * xx — thin dispatch into the shared kernel */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  PetscFunctionBegin;
  /* trans = herm = PETSC_TRUE selects op(A) = A^H (conjugate transpose) */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}
3232e6e9a74fSStefano Zampini 
/* MatMultHermitianTransposeAdd: zz = A^H * xx + yy — thin dispatch into the shared kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  PetscFunctionBegin;
  /* trans = herm = PETSC_TRUE selects op(A) = A^H (conjugate transpose) */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}
32389ae82921SPaul Mullowney 
/* MatMultTranspose: yy = A^T * xx — thin dispatch into the shared kernel */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  PetscFunctionBegin;
  /* trans = PETSC_TRUE, herm = PETSC_FALSE selects op(A) = A^T (no conjugation) */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3244ca45077fSPaul Mullowney 
/* ScatterAdd - device kernel computing y[idx[i]] += x[i] for i in [0, n).
   Used below to accumulate the compressed-row SpMV result (work vector) back
   into the full-length output vector. Assumes the entries of idx are distinct
   (they are compressed row indices at the current call site), so no atomics are
   needed — NOTE(review): confirm before reusing with duplicate indices.
   Any 1-D launch configuration is valid; the caller uses 256-thread blocks on
   PetscDefaultCudaStream. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y) {
  /* Grid-stride loop with index arithmetic done in PetscInt: the previous
     "int i = blockIdx.x * blockDim.x + threadIdx.x" overflows 32-bit int when n
     exceeds INT_MAX (PetscInt may be 64-bit), and tying correctness to the exact
     grid size is fragile. */
  for (PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += (PetscInt)gridDim.x * blockDim.x) y[idx[i]] += x[i];
}
3249a0e72f99SJunchao Zhang 
3250afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) {
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny; /* lengths of x and y in y = op(A) x; only needed by the CUDA >= 11 generic SpMV API */
#endif

  PetscFunctionBegin;
  /* herm without trans would mean op(A) = conj(A), for which cuSPARSE has no operation */
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  /* Empty matrix: op(A) x == 0, so zz is just yy (or zero when there is nothing to add) */
  if (!a->nz) {
    if (!yy) PetscCall(VecSet_SeqCUDA(zz, 0));
    else PetscCall(VecCopy_SeqCUDA(yy, zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  /* Select the device representation and the cuSPARSE operation implementing op(A) */
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose on the fly */
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      /* multiply with an explicitly stored A^T (built lazily), using the non-transpose operation */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols;
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        /* presumably workVector[i] = x[cprowIndices[i]] via the permutation iterator;
           VecCUDAEqualsReverse is defined elsewhere in this file — confirm its assignment direction there */
        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* for the transposed product the roles of rows and columns swap */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows;
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* opA indexes the per-operation descriptor cache cuSpMV[] below, so guard against enum layout changes */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      /* pre-CUDA-11 legacy CSR SpMV API: all raw pointers, no descriptors */
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      /* ELL/HYB storage path (only available before CUDA-11) */
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                    /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                          /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                     /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        /* zz[cprowIndices[i]] += workVector[i] */
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      /* transpose path: the SpMV already wrote the full-length result (beta handled yy == zz); only a distinct yy remains to be added */
      if (yy && yy != zz) { PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */ }
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); } /* translate C-string exceptions into a PETSc error */
  /* 2 flops (multiply + add) per stored nonzero; without yy, each nonzero row performs one fewer add */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}
34179ae82921SPaul Mullowney 
/* MatMultTransposeAdd: zz = A^T * xx + yy — thin dispatch into the shared kernel */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  PetscFunctionBegin;
  /* trans = PETSC_TRUE, herm = PETSC_FALSE selects op(A) = A^T (no conjugation) */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3423ca45077fSPaul Mullowney 
/* Finish assembly via the base SeqAIJ implementation, then drop the cached raw
   device matrix if assembly changed the nonzero pattern (it would be stale). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode) {
  Mat_SeqAIJCUSPARSE *cusp           = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscObjectState    state_on_entry = A->nonzerostate; /* snapshot before assembly */

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  /* a bumped nonzerostate means the sparsity pattern changed, invalidating deviceMat */
  if (cusp->deviceMat && state_on_entry != A->nonzerostate) {
    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
34379ae82921SPaul Mullowney 
34389ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
3439e057df02SPaul Mullowney /*@
344011a5261eSBarry Smith    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3441e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
344211a5261eSBarry Smith    to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
3443e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
3444e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
3445e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
34469ae82921SPaul Mullowney 
3447d083f849SBarry Smith    Collective
34489ae82921SPaul Mullowney 
34499ae82921SPaul Mullowney    Input Parameters:
345011a5261eSBarry Smith +  comm - MPI communicator, set to `PETSC_COMM_SELF`
34519ae82921SPaul Mullowney .  m - number of rows
34529ae82921SPaul Mullowney .  n - number of columns
34539ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
34549ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
34550298fd71SBarry Smith          (possibly different for each row) or NULL
34569ae82921SPaul Mullowney 
34579ae82921SPaul Mullowney    Output Parameter:
34589ae82921SPaul Mullowney .  A - the matrix
34599ae82921SPaul Mullowney 
346011a5261eSBarry Smith    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
34619ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
346211a5261eSBarry Smith    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
34639ae82921SPaul Mullowney 
34649ae82921SPaul Mullowney    Notes:
34659ae82921SPaul Mullowney    If nnz is given then nz is ignored
34669ae82921SPaul Mullowney 
346711a5261eSBarry Smith    The AIJ format, also called
346811a5261eSBarry Smith    compressed row storage, is fully compatible with standard Fortran 77
34699ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
34709ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
34719ae82921SPaul Mullowney 
34729ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
347311a5261eSBarry Smith    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
34749ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
34759ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
34769ae82921SPaul Mullowney 
34779ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
34789ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
34799ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
34809ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
34819ae82921SPaul Mullowney 
34829ae82921SPaul Mullowney    Level: intermediate
34839ae82921SPaul Mullowney 
348411a5261eSBarry Smith .seealso: `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
34859ae82921SPaul Mullowney @*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) {
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n)); /* sequential matrix: local sizes equal global sizes */
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* call the SeqAIJ preallocation implementation directly, since the type was just fixed above */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(0);
}
34949ae82921SPaul Mullowney 
/* Tear down a MATSEQAIJCUSPARSE matrix: release the GPU-side storage hanging off
   A->spptr, detach every method composed on the object, then delegate to the
   base SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) {
  /* names of all functions composed on this matrix type; each is reset to NULL before destruction */
  const char *const composed_funcs[] = {"MatSeqAIJCopySubArray_C", "MatCUSPARSESetFormat_C", "MatCUSPARSESetUseCPUSolve_C", "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                        "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", "MatFactorGetSolverType_C", "MatSetPreallocationCOO_C", "MatSetValuesCOO_C", "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  /* the layout of A->spptr depends on whether this is a factored matrix */
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  else PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  for (size_t i = 0; i < sizeof(composed_funcs) / sizeof(composed_funcs[0]); ++i) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed_funcs[i], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
35159ae82921SPaul Mullowney 
3516ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
351795639643SRichard Tran Mills static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) {
  PetscFunctionBegin;
  /* Duplicate through the base SeqAIJ implementation, then convert the copy in
     place so it regains the CUSPARSE type and its GPU data structures. */
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(0);
}
35249ff858a8SKarl Rupp 
/* MatAXPY_SeqAIJCUSPARSE - computes Y = a*X + Y for two sequential AIJCUSPARSE matrices.

   Three paths are taken:
     - if either matrix is bound to the CPU (detected by comparing the axpy function
       pointers), fall back to the host MatAXPY_SeqAIJ after invalidating Y's cached
       transpose;
     - SUBSET_NONZERO_PATTERN: cusparse csrgeam (sparse matrix-matrix addition) on the GPU;
     - SAME_NONZERO_PATTERN: a single cublas axpy over the value arrays (the sparsity
       structures are identical, so only the values need combining).
   Any other pattern falls back to the host implementation. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) {
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  /* mismatched axpy pointers means one of the matrices has been bound to the CPU */
  if (X->ops->axpy != Y->ops->axpy) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* same nonzero count: compare row offsets and column indices on the device to
       detect an identical pattern the caller did not declare */
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* coefficient of Y in the geam: Y = a*X + 1*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* the scalars a and b live on the host, so switch the pointer mode temporarily */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* CUDA >= 11 requires an explicit workspace query (csrgeam2-style API) */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* restore the mode expected by the rest of the CUSPARSE code */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical sparsity: the value arrays are conformal, so a dense axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns: let the host implementation build the union structure */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(0);
}
360995639643SRichard Tran Mills 
/* MatScale_SeqAIJCUSPARSE - scales all nonzero values of Y by a on the GPU
   with a single cublas scal call over the value array. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) {
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *vals;
  cublasHandle_t handle;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &vals));
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, vals, inc));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &vals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}
362833c9ba73SStefano Zampini 
/* MatZeroEntries_SeqAIJCUSPARSE - zeroes the matrix values. For unfactored
   matrices the device copies (and the cached transpose, if present) are zeroed
   with thrust::fill; the host copy is always zeroed, and the offload mask is
   set to BOTH only when the device values were zeroed too. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) {
  Mat_SeqAIJ *aij    = (Mat_SeqAIJ *)A->data;
  PetscBool   on_gpu = PETSC_FALSE; /* set when the device values were zeroed */

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix *)spptr->mat->mat;
      if (csr->values) {
        on_gpu = PETSC_TRUE;
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix *)spptr->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  /* always zero the host copy as well */
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = on_gpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
36543fa6b06aSMark Adams 
/* MatBindToCPU_SeqAIJCUSPARSE - switches the matrix between CPU and GPU implementations.

   flg = PETSC_TRUE  : pull the values back to the host, install the plain SeqAIJ
                       operation table, and remove the GPU-only composed functions;
   flg = PETSC_FALSE : install the CUSPARSE operation table plus the device-aware
                       array accessors and composed functions.
   Factored matrices only record the flag and keep their current operations. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) {
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* make sure the host copy is current before handing control to the CPU kernels */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* wipe the SeqAIJ sub-operation table (array accessors etc.) so the defaults are used */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* device-aware accessors for the value array and CSR structure */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are a CPU-only optimization; enable them only when bound to the CPU */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3717a587d139SMark 
/* MatConvert_SeqAIJ_SeqAIJCUSPARSE - converts a MATSEQAIJ matrix to MATSEQAIJCUSPARSE.

   For MAT_INITIAL_MATRIX / MAT_REUSE_MATRIX a host copy is made first; for
   MAT_INPLACE_MATRIX the input matrix itself is promoted. The GPU-side context
   (Mat_SeqAIJCUSPARSE or, for factored matrices, Mat_SeqAIJCUSPARSETriFactors)
   is allocated lazily here, together with the cusparse handle bound to PETSc's
   default CUDA stream. Finally the CUSPARSE operation table is installed via
   MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE). */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) {
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  /* vectors created to match this matrix should live on the GPU too */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* the algorithm enum for SpMV was renamed after cuSPARSE 11.3.1 */
#if CUSPARSE_VERSION > 11301
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
#else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
#endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry the triangular-factor container instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU operation table and the composed query functions */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
37769ae82921SPaul Mullowney 
/* MatCreate_SeqAIJCUSPARSE - type constructor for MATSEQAIJCUSPARSE: builds a plain
   SeqAIJ matrix and then converts it to the CUSPARSE type in place. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) {
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}
378302fe1965SBarry Smith 
37843ca39a21SBarry Smith /*MC
3785e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3786e057df02SPaul Mullowney 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format.
378911a5261eSBarry Smith    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
3790e057df02SPaul Mullowney 
3791e057df02SPaul Mullowney    Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
3796e057df02SPaul Mullowney 
3797e057df02SPaul Mullowney   Level: beginner
3798e057df02SPaul Mullowney 
379911a5261eSBarry Smith .seealso: `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3800e057df02SPaul Mullowney M*/
38017f756511SDominic Meiser 
3802bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);
38030f39cd5aSBarry Smith 
/* MatSolverTypeRegister_CUSPARSE - registers the CUSPARSE-based direct solver
   packages with PETSc's solver-type registry. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) {
  PetscFunctionBegin;
  /* the banded LU solver accepts plain MATSEQAIJ input */
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
  /* LU/Cholesky/ILU/ICC factorizations for MATSEQAIJCUSPARSE matrices */
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));

  PetscFunctionReturn(0);
}
381429b38603SBarry Smith 
/* MatResetPreallocationCOO_SeqAIJCUSPARSE - releases the COO assembly data
   (permutation vectors and, for the extended COO path, the device-side
   jmap/perm arrays) so the matrix can be re-preallocated. */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    /* NULL the freed pointers: MatSeqAIJCUSPARSE_Destroy() frees jmap_d/perm_d
       again whenever they are non-NULL, which would double-free after a reset */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3831cbc6b225SStefano Zampini 
/* MatSeqAIJCUSPARSE_Destroy - frees the GPU-side context of an unfactored matrix:
   the CSR (and cached transpose) structures, work/scratch vectors, COO data, the
   cusparse handle, and the context struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) {
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    /* extended-COO device arrays; NULL when the extended path was never used or was reset */
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(0);
}
38497f756511SDominic Meiser 
/* CsrMatrix_Destroy - frees a CsrMatrix (values, column indices, row offsets)
   and NULLs the caller's pointer. Safe to call on an already-NULL pointer. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) {
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;

    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
38617f756511SDominic Meiser 
/* MatSeqAIJCUSPARSEMultStruct_Destroy (triangular-factor overload) - frees one
   triangular factor: its matrix descriptor, csrsv solve info, CSR storage,
   solve/csr2csc scratch buffers, and pinned host copy of the values. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) {
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    /* AA_h is pinned host memory, hence cudaFreeHost */
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(0);
}
38777f756511SDominic Meiser 
/* MatSeqAIJCUSPARSEMultStruct_Destroy (mult-struct overload) - frees the device
   storage behind a MatMult structure. Depending on the storage format the payload
   is either a cusparse hybrid matrix (pre-CUDA-11 only) or a CsrMatrix; the
   descriptors, scalar constants and (CUDA >= 11) generic-API SpMV descriptors and
   buffers are released as well, and the caller's pointer is NULLed. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format) {
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat)); /* was called without error checking */
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalar constants */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* one cached SpMV setup per operation (mult / transpose / hermitian-transpose) */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
39187f756511SDominic Meiser 
/* MatSeqAIJCUSPARSETriFactors_Reset - releases all factorization data held by the
   triangular-factor container while keeping the container (and its cusparse
   handle) alive so a subsequent factorization can reuse it. Freed raw device
   pointers are NULLed so a repeated Reset (or a later Destroy) is safe. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors) {
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    /* NULL the freed band-solver arrays so they are not freed a second time */
    fs->a_band_d      = NULL;
    fs->i_band_d      = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if CUSPARSE_VERSION >= 11500
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    /* cudaFree(NULL) is a no-op, so NULLing keeps a repeated Reset safe */
    fs->csrRowPtr     = NULL;
    fs->csrColIdx     = NULL;
    fs->csrVal        = NULL;
    fs->X             = NULL;
    fs->Y             = NULL;
    fs->spsvBuffer_L  = NULL;
    fs->spsvBuffer_U  = NULL;
    fs->spsvBuffer_Lt = NULL;
    fs->spsvBuffer_Ut = NULL;
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(0);
}
3966ccdfe979SStefano Zampini 
/* Destroy a Mat_SeqAIJCUSPARSETriFactors container: reset its factor data, tear
   down the cuSPARSE handle it carries (if any), free the struct, and NULL the
   caller's pointer (done by PetscFree). No-op when *trifactors is already NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) {
  PetscFunctionBegin;
  if (*trifactors) {
    cusparseHandle_t handle;

    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    handle = (*trifactors)->handle; /* read after Reset, matching the original sequencing */
    if (handle) PetscCallCUSPARSE(cusparseDestroy(handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}
39787e8381f9SStefano Zampini 
/* Strict-weak ordering on (i,j) index tuples: order by the first component,
   breaking ties with the second. Used to sort COO entries by row, then column. */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) {
    const PetscInt a = t1.get<0>(), b = t2.get<0>();
    if (a != b) return a < b;
    return t1.get<1>() < t2.get<1>();
  }
};
39867e8381f9SStefano Zampini 
/* Equality predicate on (i,j) index tuples: true iff both components match.
   Used to collapse duplicate COO entries. */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) { return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>(); }
};
39937e8381f9SStefano Zampini 
/* Binary op for thrust::adjacent_difference: yields 1 where consecutive values
   differ and 0 where they are equal (marks the start of a new index run). */
struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 != t2) ? 1 : 0; }
};
39977e8381f9SStefano Zampini 
/* Logical-OR combiner producing 0/1: a new nonzero starts when either the row
   or the column changed between consecutive sorted COO entries. */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 || t2) ? 1 : 0; }
};
40017e8381f9SStefano Zampini 
40027e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
4003219fbbafSJunchao Zhang /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/* Scatter/accumulate COO values v[] (ordered as the user supplied them at
   preallocation time) into the device CSR values of A, using the permutation
   cusp->cooPerm built by MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().

   Input Parameters:
+  A     - the MATSEQAIJCUSPARSE matrix, preallocated via the basic COO path
.  v     - the COO values, in host or device memory; NULL zeroes the matrix for INSERT_VALUES
-  imode - INSERT_VALUES or ADD_VALUES

   Leaves the matrix assembled with up-to-date data on the GPU. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                          *cooPerm_v = NULL; /* device staging buffer, allocated only when v[] is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO permutation recorded: nothing to scatter, just (re)assemble */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    /* NULL values with INSERT_VALUES means "set all entries to zero" */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* stage the host values on the device */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add them together first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      /* unique entries: straight permuted copy, values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
40797e8381f9SStefano Zampini 
/* Mark the cached transpose of a MATSEQAIJCUSPARSE matrix as out of date.
   When destroy is PETSC_TRUE the cached transpose structure and the csr2csc
   index map are freed outright instead of merely being flagged stale.
   Silently does nothing when the GPU data structure has not been created. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE; /* force a rebuild before the transpose is used again */
  }
  PetscFunctionReturn(0);
}
4094a49f1ed0SStefano Zampini 
40957e8381f9SStefano Zampini #include <thrust/binary_search.h>
4096219fbbafSJunchao Zhang /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/* Build the CSR structure of A on the device from COO index arrays coo_i[]/coo_j[]
   (each of length n, host or device memory, no negative entries), and record in
   cusp->cooPerm (and, when duplicates exist, cusp->cooPerm_a) the permutation
   needed by MatSetValuesCOO_SeqAIJCUSPARSE_Basic() to scatter user values later. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* COO size changed: drop any previously cached permutations */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    /* stage coo_i[] on the device if the caller passed host memory */
    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n); /* per-row upper bounds; becomes a->i[1..n] */

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Running example used in the comments below:
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
    THRUSTINTARRAY w(d_j, d_j + n);

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* rebuild the host-side CSR arrays of the Mat_SeqAIJ from the device results */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(0);
}
4230ed502f03SStefano Zampini 
/* COO preallocation entry point for MATSEQAIJCUSPARSE.
   When the index arrays live on the device, or live on the host and contain no
   negative (i.e. ignored) entries, dispatch to the fast "basic" device path.
   Otherwise run the generic host SeqAIJ COO preallocation and mirror its
   jmap[]/perm[] tables to the device for the extended MatSetValuesCOO path. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) {
  PetscBool    coo_basic = PETSC_TRUE;
  PetscMemType mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      /* the basic path cannot cope with negative indices; scan until one is found */
      PetscCount k = 0;

      while (coo_basic && k < coo_n) {
        if (coo_i[k] < 0 || coo_j[k] < 0) coo_basic = PETSC_FALSE;
        k++;
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    Mat_SeqAIJ         *seq;
    Mat_SeqAIJCUSPARSE *dev;

    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    /* copy the host-built jmap[] (per-nonzero value ranges) and perm[] (COO->CSR
       scatter) tables to the device so MatSetValuesCOO can run there */
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4268219fbbafSJunchao Zhang 
/* Device kernel: for each CSR nonzero i, sum the COO input values kv[perm[k]]
   with k in [jmap[i], jmap[i+1]) and insert (INSERT_VALUES) or accumulate
   (otherwise) into a[i]. Launched 1-D; any grid size is valid thanks to the
   grid-stride loop. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) {
  const PetscCount stride = gridDim.x * blockDim.x;

  for (PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += stride) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4278219fbbafSJunchao Zhang 
/* Insert or add COO values into a MATSEQAIJCUSPARSE matrix. With the extended
   COO layout the values are combined on the device via the MatAddCOOValues
   kernel (staging v[] on the device first when it is host memory); otherwise
   the basic thrust-based path is used. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_SeqAIJ         *seq = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscMemType        memtype;
  const PetscScalar  *d_v = v;
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (!dev->use_extended_coo) {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
  } else {
    const PetscCount nnz = seq->nz;

    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void **)&d_v, seq->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)d_v, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    /* INSERT_VALUES overwrites every entry, so write-only access suffices */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

    if (nnz) {
      MatAddCOOValues<<<(nnz + 255) / 256, 256>>>(d_v, nnz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* surface launch-configuration errors without clearing the error state */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)d_v));
  }
  PetscFunctionReturn(0);
}
4312219fbbafSJunchao Zhang 
43135b7e41feSStefano Zampini /*@C
431411a5261eSBarry Smith     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for `MATSEQAIJCUSPARSE` matrices.
43155b7e41feSStefano Zampini 
43165b7e41feSStefano Zampini    Not collective
43175b7e41feSStefano Zampini 
43185b7e41feSStefano Zampini     Input Parameters:
43195b7e41feSStefano Zampini +   A - the matrix
432011a5261eSBarry Smith -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
43215b7e41feSStefano Zampini 
43225b7e41feSStefano Zampini     Output Parameters:
43235b7e41feSStefano Zampini +   ia - the CSR row pointers
43245b7e41feSStefano Zampini -   ja - the CSR column indices
43255b7e41feSStefano Zampini 
43265b7e41feSStefano Zampini     Level: developer
43275b7e41feSStefano Zampini 
432811a5261eSBarry Smith     Note:
43295b7e41feSStefano Zampini       When compressed is true, the CSR structure does not contain empty rows
43305b7e41feSStefano Zampini 
4331db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
43325b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(0); /* both pointers are required; bail out if either is missing */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* may trigger host-to-device copy if matrix data is fresher on host */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* lazily build and cache the uncompressed row offsets on the device from the host a->i[] */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
43595f101d05SStefano Zampini 
43605b7e41feSStefano Zampini /*@C
436111a5261eSBarry Smith     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
43625b7e41feSStefano Zampini 
43635b7e41feSStefano Zampini    Not collective
43645b7e41feSStefano Zampini 
43655b7e41feSStefano Zampini     Input Parameters:
43665b7e41feSStefano Zampini +   A - the matrix
436711a5261eSBarry Smith -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
43685b7e41feSStefano Zampini 
43695b7e41feSStefano Zampini     Output Parameters:
43705b7e41feSStefano Zampini +   ia - the CSR row pointers
43715b7e41feSStefano Zampini -   ja - the CSR column indices
43725b7e41feSStefano Zampini 
43735b7e41feSStefano Zampini     Level: developer
43745b7e41feSStefano Zampini 
4375db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetIJ()`
43765b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* no reference counting is done for the device IJ arrays; just NULL the caller's pointers */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}
43855f101d05SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Note:
   May trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* a raw values array is only exposed for the CSR storage format */
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* ensure device values are current before handing out the pointer */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  PetscFunctionReturn(0);
}
4420ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access cannot have modified the values, so no invalidation or state
     increase is needed; just zero the caller's pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}
4444ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Note:
   May trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* a raw values array is only exposed for the CSR storage format */
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* caller may read, so device values must be current */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = matrix->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller may write through the pointer: the device copy becomes the valid one */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values may have been changed through the pointer: drop cached diagonal
     information and bump the object state so dependents recompute */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4506039c6fbaSStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Note:
   Does not trigger host-device copies and flags data validity on the GPU

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* a raw values array is only exposed for the CSR storage format */
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* no MatSeqAIJCUSPARSECopyToGPU() here: write-only access, existing values will be overwritten */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = matrix->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the device copy is about to be written and becomes the valid one */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}
4542ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values were (presumably) written through the pointer: drop cached diagonal
     information and bump the object state so dependents recompute */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4568ed502f03SStefano Zampini 
/* Strict-weak ordering on (row, column, value, flag) tuples: lexicographic by the
   first two components (row, then column); the value and flag entries do not
   participate in the comparison. */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) {
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
4576ed502f03SStefano Zampini 
/* Unary functor adding a fixed offset to an integer; used to shift column indices
   and row offsets when concatenating CSR matrices. */
struct Shift {
  int _delta; /* constant offset applied to every input */

  Shift(int delta) : _delta(delta) { }
  __host__ __device__ inline int operator()(const int &c) { return c + _delta; }
};
4583ed502f03SStefano Zampini 
4584ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
45859371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) {
4586ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4587ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4588ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4589ed502f03SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4590ed502f03SStefano Zampini   PetscInt                      Annz, Bnnz;
4591ed502f03SStefano Zampini   cusparseStatus_t              stat;
4592ed502f03SStefano Zampini   PetscInt                      i, m, n, zero = 0;
4593ed502f03SStefano Zampini 
4594ed502f03SStefano Zampini   PetscFunctionBegin;
4595ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4596ed502f03SStefano Zampini   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
4597ed502f03SStefano Zampini   PetscValidPointer(C, 4);
4598ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4599ed502f03SStefano Zampini   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
46005f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
460108401ef6SPierre Jolivet   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4602aed4548fSBarry Smith   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4603aed4548fSBarry Smith   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4604ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4605ed502f03SStefano Zampini     m = A->rmap->n;
4606ed502f03SStefano Zampini     n = A->cmap->n + B->cmap->n;
46079566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, C));
46089566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(*C, m, n, m, n));
46099566063dSJacob Faibussowitsch     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4610ed502f03SStefano Zampini     c                       = (Mat_SeqAIJ *)(*C)->data;
4611ed502f03SStefano Zampini     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4612ed502f03SStefano Zampini     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4613ed502f03SStefano Zampini     Ccsr                    = new CsrMatrix;
4614ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4615ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4616ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4617ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4618ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4619ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4620ed502f03SStefano Zampini     Ccusp->nrows            = m;
4621ed502f03SStefano Zampini     Ccusp->mat              = Cmat;
4622ed502f03SStefano Zampini     Ccusp->mat->mat         = Ccsr;
4623ed502f03SStefano Zampini     Ccsr->num_rows          = m;
4624ed502f03SStefano Zampini     Ccsr->num_cols          = n;
46259566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
46269566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
46279566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
46289566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
46299566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
46309566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
46319566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46329566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46339566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46349566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
46359566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
463628b400f6SJacob Faibussowitsch     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
463728b400f6SJacob Faibussowitsch     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4638ed502f03SStefano Zampini 
4639ed502f03SStefano Zampini     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4640ed502f03SStefano Zampini     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4641ed502f03SStefano Zampini     Annz                 = (PetscInt)Acsr->column_indices->size();
4642ed502f03SStefano Zampini     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4643ed502f03SStefano Zampini     c->nz                = Annz + Bnnz;
4644ed502f03SStefano Zampini     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4645ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4646ed502f03SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
4647ed502f03SStefano Zampini     Ccsr->num_entries    = c->nz;
4648ed502f03SStefano Zampini     Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
4649ed502f03SStefano Zampini     if (c->nz) {
46502ed87e7eSStefano Zampini       auto              Acoo = new THRUSTINTARRAY32(Annz);
46512ed87e7eSStefano Zampini       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
46522ed87e7eSStefano Zampini       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
46532ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff, *Broff;
46542ed87e7eSStefano Zampini 
4655ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4656ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4657ed502f03SStefano Zampini           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4658ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
46599566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4660ed502f03SStefano Zampini         }
46612ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
46622ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4663ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4664ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4665ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4666ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
46679566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4668ed502f03SStefano Zampini         }
46692ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
46702ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
46719566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
46729371c9d4SSatish Balay       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
46739371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
46749371c9d4SSatish Balay       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
46759371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
46762ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
46772ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
46782ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
46798909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4680ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4681ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
46828909a122SStefano Zampini #else
46838909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
46848909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
46858909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
46868909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
46878909a122SStefano Zampini #endif
46882ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
46892ed87e7eSStefano Zampini       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
46902ed87e7eSStefano Zampini       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
46912ed87e7eSStefano Zampini       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
46922ed87e7eSStefano Zampini       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
46932ed87e7eSStefano Zampini       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4694ed502f03SStefano Zampini       auto p1    = Ccusp->cooPerm->begin();
4695ed502f03SStefano Zampini       auto p2    = Ccusp->cooPerm->begin();
4696ed502f03SStefano Zampini       thrust::advance(p2, Annz);
4697792fecdfSBarry Smith       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
46988909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
46998909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
47008909a122SStefano Zampini #endif
47012ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
47022ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
47032ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
4704792fecdfSBarry Smith       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
47052ed87e7eSStefano Zampini #else
47062ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
4707792fecdfSBarry Smith       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4708792fecdfSBarry Smith       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
47092ed87e7eSStefano Zampini #endif
47109371c9d4SSatish Balay       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
47119371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
47129566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
47132ed87e7eSStefano Zampini       delete wPerm;
47142ed87e7eSStefano Zampini       delete Acoo;
47152ed87e7eSStefano Zampini       delete Bcoo;
47162ed87e7eSStefano Zampini       delete Ccoo;
4717ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
47189371c9d4SSatish Balay       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
47199371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
4720ed502f03SStefano Zampini #endif
47211a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
47229566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
47239566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4724ed502f03SStefano Zampini         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4725ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4726ed502f03SStefano Zampini         CsrMatrix                    *CcsrT = new CsrMatrix;
4727ed502f03SStefano Zampini         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4728ed502f03SStefano Zampini         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4729ed502f03SStefano Zampini 
47301a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
47311a2c6b5cSJunchao Zhang         (*C)->transupdated            = PETSC_TRUE;
4732a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu         = NULL;
4733ed502f03SStefano Zampini         CmatT->cprowIndices           = NULL;
4734ed502f03SStefano Zampini         CmatT->mat                    = CcsrT;
4735ed502f03SStefano Zampini         CcsrT->num_rows               = n;
4736ed502f03SStefano Zampini         CcsrT->num_cols               = m;
4737ed502f03SStefano Zampini         CcsrT->num_entries            = c->nz;
4738ed502f03SStefano Zampini 
4739ed502f03SStefano Zampini         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4740ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4741ed502f03SStefano Zampini         CcsrT->values         = new THRUSTARRAY(c->nz);
4742ed502f03SStefano Zampini 
47439566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
4744ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4745ed502f03SStefano Zampini         if (AT) {
4746ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4747ed502f03SStefano Zampini           thrust::advance(rT, -1);
4748ed502f03SStefano Zampini         }
4749ed502f03SStefano Zampini         if (BT) {
4750ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4751ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4752ed502f03SStefano Zampini           thrust::copy(titb, tite, rT);
4753ed502f03SStefano Zampini         }
4754ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4755ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4756ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4757ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4758ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4759ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
47609566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
4761ed502f03SStefano Zampini 
47629566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
47639566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
47649566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
47659566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
47669566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
47679566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
47689566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47699566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47709566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4771ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
47729371c9d4SSatish Balay         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
47739371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
4774ed502f03SStefano Zampini #endif
4775ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4776ed502f03SStefano Zampini       }
4777ed502f03SStefano Zampini     }
4778ed502f03SStefano Zampini 
4779ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4780ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4781ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
47829566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m + 1, &c->i));
47839566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->j));
4784ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4785ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4786ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4787ed502f03SStefano Zampini       ii = *Ccsr->row_offsets;
4788ed502f03SStefano Zampini       jj = *Ccsr->column_indices;
47899566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
47909566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4791ed502f03SStefano Zampini     } else {
47929566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
47939566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4794ed502f03SStefano Zampini     }
47959566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
47969566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->ilen));
47979566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->imax));
4798ed502f03SStefano Zampini     c->maxnz         = c->nz;
4799ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4800ed502f03SStefano Zampini     c->rmax          = 0;
4801ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4802ed502f03SStefano Zampini       const PetscInt nn = c->i[i + 1] - c->i[i];
4803ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4804ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt) !!nn;
4805ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax, nn);
4806ed502f03SStefano Zampini     }
48079566063dSJacob Faibussowitsch     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
48089566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->a));
4809ed502f03SStefano Zampini     (*C)->nonzerostate++;
48109566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->rmap));
48119566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->cmap));
4812ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4813ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4814ed502f03SStefano Zampini   } else {
481508401ef6SPierre Jolivet     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4816ed502f03SStefano Zampini     c = (Mat_SeqAIJ *)(*C)->data;
4817ed502f03SStefano Zampini     if (c->nz) {
4818ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
48195f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
4820aed4548fSBarry Smith       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
482108401ef6SPierre Jolivet       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
48229566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
48239566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
48245f80ce2aSJacob Faibussowitsch       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
48255f80ce2aSJacob Faibussowitsch       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4826ed502f03SStefano Zampini       Acsr = (CsrMatrix *)Acusp->mat->mat;
4827ed502f03SStefano Zampini       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4828ed502f03SStefano Zampini       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4829aed4548fSBarry Smith       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4830aed4548fSBarry Smith       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4831aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4832aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
48335f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
4834ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4835ed502f03SStefano Zampini       thrust::advance(pmid, Acsr->num_entries);
48369566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
48379371c9d4SSatish Balay       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
48389371c9d4SSatish Balay       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4839ed502f03SStefano Zampini       thrust::for_each(zibait, zieait, VecCUDAEquals());
48409371c9d4SSatish Balay       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
48419371c9d4SSatish Balay       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
4842ed502f03SStefano Zampini       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
48439566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
48441a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
48455f80ce2aSJacob Faibussowitsch         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4846ed502f03SStefano Zampini         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4847ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4848ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4849ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4850ed502f03SStefano Zampini         auto       vT    = CcsrT->values->begin();
4851ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4852ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
48531a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4854ed502f03SStefano Zampini       }
48559566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
4856ed502f03SStefano Zampini     }
4857ed502f03SStefano Zampini   }
48589566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4859ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4860ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4861ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4862ed502f03SStefano Zampini   PetscFunctionReturn(0);
4863ed502f03SStefano Zampini }
4864c215019aSStefano Zampini 
48659371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) {
4866c215019aSStefano Zampini   bool               dmem;
4867c215019aSStefano Zampini   const PetscScalar *av;
4868c215019aSStefano Zampini 
4869c215019aSStefano Zampini   PetscFunctionBegin;
4870c215019aSStefano Zampini   dmem = isCudaMem(v);
48719566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
4872c215019aSStefano Zampini   if (n && idx) {
4873c215019aSStefano Zampini     THRUSTINTARRAY widx(n);
4874c215019aSStefano Zampini     widx.assign(idx, idx + n);
48759566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
4876c215019aSStefano Zampini 
4877c215019aSStefano Zampini     THRUSTARRAY                    *w = NULL;
4878c215019aSStefano Zampini     thrust::device_ptr<PetscScalar> dv;
4879c215019aSStefano Zampini     if (dmem) {
4880c215019aSStefano Zampini       dv = thrust::device_pointer_cast(v);
4881c215019aSStefano Zampini     } else {
4882c215019aSStefano Zampini       w  = new THRUSTARRAY(n);
4883c215019aSStefano Zampini       dv = w->data();
4884c215019aSStefano Zampini     }
4885c215019aSStefano Zampini     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4886c215019aSStefano Zampini 
4887c215019aSStefano Zampini     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4888c215019aSStefano Zampini     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4889c215019aSStefano Zampini     thrust::for_each(zibit, zieit, VecCUDAEquals());
489048a46eb9SPierre Jolivet     if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
4891c215019aSStefano Zampini     delete w;
4892c215019aSStefano Zampini   } else {
48939566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
4894c215019aSStefano Zampini   }
48959566063dSJacob Faibussowitsch   if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
48969566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
4897c215019aSStefano Zampini   PetscFunctionReturn(0);
4898c215019aSStefano Zampini }
4899